From e5f25c231d3e3ccad8a56d509e1bb14cc072daec Mon Sep 17 00:00:00 2001
From: Matthew LeVan
Date: Wed, 31 Dec 2025 13:20:04 -0500
Subject: [PATCH 01/38] disk: book initial commit

---
 build.zig             |    5 +
 pkg/vere/book_tests.c |  868 ++++++++++++++++++++++++++++++
 pkg/vere/build.zig    |    2 +
 pkg/vere/db/book.c    | 1163 +++++++++++++++++++++++++++++++++++++++++
 pkg/vere/db/book.h    |  114 ++++
 pkg/vere/vere.h       |    1 +
 6 files changed, 2153 insertions(+)
 create mode 100644 pkg/vere/book_tests.c
 create mode 100644 pkg/vere/db/book.c
 create mode 100644 pkg/vere/db/book.h

diff --git a/build.zig b/build.zig
index aa5f4afcf8..20ff2e88d3 100644
--- a/build.zig
+++ b/build.zig
@@ -633,6 +633,11 @@ fn buildBinary(
             .file = "pkg/vere/ames_tests.c",
             .deps = vere_test_deps,
         },
+        .{
+            .name = "book-test",
+            .file = "pkg/vere/book_tests.c",
+            .deps = vere_test_deps,
+        },
         .{
             .name = "boot-test",
             .file = "pkg/vere/boot_tests.c",
diff --git a/pkg/vere/book_tests.c b/pkg/vere/book_tests.c
new file mode 100644
index 0000000000..2e5a0d1048
--- /dev/null
+++ b/pkg/vere/book_tests.c
@@ -0,0 +1,868 @@
+/// @file
+
+#include "db/book.h"
+#include "vere.h"
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+/* test helpers
+*/
+
+/* _test_tmpdir(): create temporary test directory.
+**
+**   returns a heap-allocated path (caller frees with c3_free),
+**   or 0 if the directory could not be created.
+*/
+static c3_c*
+_test_tmpdir(const c3_c* prefix)
+{
+  const c3_z siz_z = 256;
+  c3_c*      tmp_c = c3_malloc(siz_z);
+  int        len_i = snprintf(tmp_c, siz_z, "/tmp/%s-XXXXXX", prefix);
+
+  //  a truncated template would lose its XXXXXX suffix
+  //
+  if (  (len_i < 0)
+     || ((c3_z)len_i >= siz_z)
+     || !mkdtemp(tmp_c) )
+  {
+    fprintf(stderr, "book_tests: failed to create temp dir\r\n");
+    c3_free(tmp_c);
+    return 0;
+  }
+
+  return tmp_c;
+}
+
+/* _test_cleanup(): remove test directory and contents.
+**
+**   the path is single-quoted for the shell; failures are reported
+**   but do not abort the test run.
+*/
+static void
+_test_cleanup(const c3_c* dir_c)
+{
+  c3_c cmd_c[512];
+  int  len_i = snprintf(cmd_c, sizeof(cmd_c), "rm -rf -- '%s'", dir_c);
+
+  //  never hand a truncated path to rm -rf
+  //
+  if ( (len_i < 0) || ((c3_z)len_i >= sizeof(cmd_c)) ) {
+    fprintf(stderr, "book_tests: cleanup path too long: %s\r\n", dir_c);
+    return;
+  }
+
+  if ( 0 != system(cmd_c) ) {
+    fprintf(stderr, "book_tests: failed to remove %s\r\n", dir_c);
+  }
+}
+
+/* _test_make_event(): create a fake event buffer (mug + jam data).
+*/
+static void
+_test_make_event(c3_y** buf_y, c3_z* siz_z, c3_d eve_d)
+{
+  //  simple fake event: 4-byte mug + variable jam data
+  //    mug = eve_d as 32-bit value
+  //    jam = repeating pattern based on eve_d
+  //
+  //  the caller owns *buf_y and releases it with c3_free()
+
+  c3_w mug_w   = (c3_w)eve_d;
+  c3_z jam_len = 16 + (eve_d % 32);  // 16-47 bytes of jam data
+
+  *siz_z = 4 + jam_len;
+  *buf_y = c3_malloc(*siz_z);
+
+  memcpy(*buf_y, &mug_w, 4);
+
+  //  fill jam data with pattern
+  for ( c3_z i = 0; i < jam_len; i++ ) {
+    (*buf_y)[4 + i] = (c3_y)((eve_d + i) & 0xff);
+  }
+}
+
+/* _test_verify_event(): verify event buffer matches expected.
+**
+**   checks, in order: total size, mug, jam pattern. returns c3y
+**   only if [buf_v] is exactly what _test_make_event() produces
+**   for [eve_d].
+*/
+static c3_o
+_test_verify_event(c3_d eve_d, c3_z siz_z, void* buf_v)
+{
+  c3_y* buf_y        = (c3_y*)buf_v;
+  c3_z  expected_len = 16 + (eve_d % 32);
+  c3_w  mug_w;
+
+  //  check the size first: nothing below may read past siz_z bytes
+  //
+  if ( siz_z != 4 + expected_len ) {
+    fprintf(stderr, "book_tests: event %llu size mismatch: got %zu, expected %zu (4 + %zu)\r\n",
+                    eve_d, siz_z, 4 + expected_len, expected_len);
+    return c3n;
+  }
+
+  memcpy(&mug_w, buf_y, 4);
+
+  if ( mug_w != (c3_w)eve_d ) {
+    fprintf(stderr, "book_tests: event %llu mug mismatch: got %u\r\n", eve_d, mug_w);
+    return c3n;
+  }
+
+  //  verify jam data pattern
+  for ( c3_z i = 0; i < expected_len; i++ ) {
+    if ( buf_y[4 + i] != (c3_y)((eve_d + i) & 0xff) ) {
+      fprintf(stderr, "book_tests: event %llu data mismatch at offset %zu\r\n",
+                      eve_d, i);
+      return c3n;
+    }
+  }
+
+  return c3y;
+}
+
+/* read callback context
+*/
+typedef struct _read_ctx {
+  c3_d count;           //  events seen so far
+  c3_d expected_start;  //  event number of the first expected callback
+  c3_o failed;          //  c3y once any callback has rejected an event
+} read_ctx;
+
+/* _test_read_cb(): callback for u3_book_read().
+*/
+static c3_o
+_test_read_cb(void* ptr_v, c3_d eve_d, c3_z siz_z, void* buf_v)
+{
+  read_ctx* ctx   = (read_ctx*)ptr_v;
+  c3_d      nex_d = ctx->expected_start + ctx->count;
+
+  //  events must arrive in order, without gaps
+  //
+  if ( nex_d != eve_d ) {
+    fprintf(stderr, "book_tests: read callback event mismatch: %llu vs %llu\r\n",
+                    eve_d, nex_d);
+    ctx->failed = c3y;
+    return c3n;
+  }
+
+  //  payload must be what _test_make_event() generated
+  //
+  if ( c3n == _test_verify_event(eve_d, siz_z, buf_v) ) {
+    ctx->failed = c3y;
+    return c3n;
+  }
+
+  ctx->count++;
+  return c3y;
+}
+
+/* tests
+*/
+
+/* _test_book_init_empty(): test creating new empty log.
+*/
+static c3_o
+_test_book_init_empty(void)
+{
+  c3_c*    tmp_c = _test_tmpdir("book-init");
+  u3_book* log_u;
+  c3_d     low_d, hig_d;
+  c3_o     ret_o = c3n;
+
+  if ( !tmp_c ) {
+    return c3n;
+  }
+
+  //  create new log
+  //
+  log_u = u3_book_init(tmp_c);
+  if ( !log_u ) {
+    fprintf(stderr, "book_tests: init failed\r\n");
+    goto tidy;
+  }
+
+  //  a fresh log must report the empty gulf [0, 0]
+  //
+  if ( c3n == u3_book_gulf(log_u, &low_d, &hig_d) ) {
+    fprintf(stderr, "book_tests: gulf failed\r\n");
+  }
+  else if ( 0 != low_d || 0 != hig_d ) {
+    fprintf(stderr, "book_tests: empty gulf wrong: [%llu, %llu]\r\n",
+                    low_d, hig_d);
+  }
+  else {
+    ret_o = c3y;
+  }
+
+  u3_book_exit(log_u);
+
+tidy:
+  _test_cleanup(tmp_c);
+  c3_free(tmp_c);
+  return ret_o;
+}
+
+/* _test_book_single_event(): test writing and reading single event.
+*/
+static c3_o
+_test_book_single_event(void)
+{
+  c3_c*    tmp_c = _test_tmpdir("book-single");
+  u3_book* log_u;
+  c3_y*    buf_y;
+  c3_z     siz_z;
+  c3_d     low_d, hig_d;
+  c3_o     sav_o;
+  c3_o     ret_o = c3n;
+  read_ctx ctx   = { .count = 0, .expected_start = 1, .failed = c3n };
+
+  if ( !tmp_c ) {
+    return c3n;
+  }
+
+  log_u = u3_book_init(tmp_c);
+  if ( !log_u ) {
+    goto tidy;
+  }
+
+  //  create and save event 1; the buffer is released either way
+  //
+  _test_make_event(&buf_y, &siz_z, 1);
+  sav_o = u3_book_save(log_u, 1, 1, (void**)&buf_y, &siz_z);
+  c3_free(buf_y);
+
+  if ( c3n == sav_o ) {
+    fprintf(stderr, "book_tests: save failed\r\n");
+    goto shut;
+  }
+
+  //  the gulf must now be exactly [1, 1]
+  //
+  u3_book_gulf(log_u, &low_d, &hig_d);
+  if ( 1 != low_d || 1 != hig_d ) {
+    fprintf(stderr, "book_tests: single gulf wrong: [%llu, %llu]\r\n",
+                    low_d, hig_d);
+    goto shut;
+  }
+
+  //  read the event back through the verifying callback
+  //
+  if ( c3n == u3_book_read(log_u, &ctx, 1, 1, _test_read_cb) ) {
+    fprintf(stderr, "book_tests: read failed\r\n");
+    goto shut;
+  }
+
+  if ( c3y == ctx.failed || 1 != ctx.count ) {
+    fprintf(stderr, "book_tests: read verify failed (failed=%u, count=%llu)\r\n",
+                    ctx.failed, ctx.count);
+    goto shut;
+  }
+
+  ret_o = c3y;
+
+shut:
+  u3_book_exit(log_u);
+tidy:
+  _test_cleanup(tmp_c);
+  c3_free(tmp_c);
+  return ret_o;
+}
+
+/* _test_book_batch_write(): test writing batch of 100 events.
+*/
+static c3_o
+_test_book_batch_write(void)
+{
+  c3_c*    tmp_c = _test_tmpdir("book-batch");
+  u3_book* log_u;
+  void*    bufs[100];
+  c3_z     sizes[100];
+  c3_d     i, low_d, hig_d;
+  c3_o     sav_o;
+  c3_o     ret_o = c3n;
+  read_ctx ctx   = { .count = 0, .expected_start = 1, .failed = c3n };
+
+  if ( !tmp_c ) {
+    return c3n;
+  }
+
+  log_u = u3_book_init(tmp_c);
+  if ( !log_u ) {
+    goto tidy;
+  }
+
+  //  build events 1-100, write them as one batch, release the buffers
+  //
+  for ( i = 0; i < 100; i++ ) {
+    _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1);
+  }
+
+  sav_o = u3_book_save(log_u, 1, 100, bufs, sizes);
+
+  for ( i = 0; i < 100; i++ ) {
+    c3_free(bufs[i]);
+  }
+
+  if ( c3n == sav_o ) {
+    fprintf(stderr, "book_tests: batch save failed\r\n");
+    goto shut;
+  }
+
+  //  the gulf must now be exactly [1, 100]
+  //
+  u3_book_gulf(log_u, &low_d, &hig_d);
+  if ( 1 != low_d || 100 != hig_d ) {
+    fprintf(stderr, "book_tests: batch gulf wrong: [%llu, %llu]\r\n",
+                    low_d, hig_d);
+    goto shut;
+  }
+
+  //  read all events back
+  //
+  if ( c3n == u3_book_read(log_u, &ctx, 1, 100, _test_read_cb) ) {
+    fprintf(stderr, "book_tests: batch read failed\r\n");
+    goto shut;
+  }
+
+  if ( c3y == ctx.failed || 100 != ctx.count ) {
+    fprintf(stderr, "book_tests: batch read verify failed (failed=%u, count=%llu)\r\n",
+                    ctx.failed, ctx.count);
+    goto shut;
+  }
+
+  ret_o = c3y;
+
+shut:
+  u3_book_exit(log_u);
+tidy:
+  _test_cleanup(tmp_c);
+  c3_free(tmp_c);
+  return ret_o;
+}
+
+/* _test_book_persistence(): test closing and reopening log.
+*/
+static c3_o
+_test_book_persistence(void)
+{
+  c3_c*    tmp_c = _test_tmpdir("book-persist");
+  u3_book* log_u;
+  void*    bufs[50];
+  c3_z     sizes[50];
+  c3_d     i, low_d, hig_d;
+  c3_o     sav_o;
+  c3_o     ret_o = c3n;
+  read_ctx ctx   = { .count = 0, .expected_start = 1, .failed = c3n };
+
+  if ( !tmp_c ) {
+    return c3n;
+  }
+
+  //  first session: write 50 events, then close the log
+  //
+  log_u = u3_book_init(tmp_c);
+  if ( !log_u ) {
+    goto tidy;
+  }
+
+  for ( i = 0; i < 50; i++ ) {
+    _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1);
+  }
+
+  sav_o = u3_book_save(log_u, 1, 50, bufs, sizes);
+
+  for ( i = 0; i < 50; i++ ) {
+    c3_free(bufs[i]);
+  }
+
+  if ( c3n == sav_o ) {
+    fprintf(stderr, "book_tests: persist save failed\r\n");
+    goto shut;
+  }
+
+  u3_book_exit(log_u);
+
+  //  second session: reopen and verify everything is still there
+  //
+  log_u = u3_book_init(tmp_c);
+  if ( !log_u ) {
+    fprintf(stderr, "book_tests: persist reopen failed\r\n");
+    goto tidy;
+  }
+
+  u3_book_gulf(log_u, &low_d, &hig_d);
+  if ( 1 != low_d || 50 != hig_d ) {
+    fprintf(stderr, "book_tests: persist gulf wrong: [%llu, %llu]\r\n",
+                    low_d, hig_d);
+    goto shut;
+  }
+
+  //  read all events
+  //
+  if ( c3n == u3_book_read(log_u, &ctx, 1, 50, _test_read_cb) ) {
+    fprintf(stderr, "book_tests: persist read failed\r\n");
+    goto shut;
+  }
+
+  if ( c3y == ctx.failed || 50 != ctx.count ) {
+    fprintf(stderr, "book_tests: persist verify failed\r\n");
+    goto shut;
+  }
+
+  ret_o = c3y;
+
+shut:
+  u3_book_exit(log_u);
+tidy:
+  _test_cleanup(tmp_c);
+  c3_free(tmp_c);
+  return ret_o;
+}
+
+/* _test_book_contiguity(): test that non-contiguous writes fail.
+*/
+static c3_o
+_test_book_contiguity(void)
+{
+  c3_c*    tmp_c = _test_tmpdir("book-contig");
+  u3_book* log_u;
+  c3_y*    buf_y;
+  c3_z     siz_z;
+  c3_o     sav_o;
+  c3_o     ret_o = c3n;
+
+  if ( !tmp_c ) {
+    return c3n;
+  }
+
+  log_u = u3_book_init(tmp_c);
+  if ( !log_u ) {
+    goto tidy;
+  }
+
+  //  event 1 on an empty log: must be accepted
+  //
+  _test_make_event(&buf_y, &siz_z, 1);
+  sav_o = u3_book_save(log_u, 1, 1, (void**)&buf_y, &siz_z);
+  c3_free(buf_y);
+
+  if ( c3n == sav_o ) {
+    fprintf(stderr, "book_tests: contig save 1 failed\r\n");
+    goto shut;
+  }
+
+  //  event 3 right after event 1: must be rejected (gap)
+  //
+  _test_make_event(&buf_y, &siz_z, 3);
+  sav_o = u3_book_save(log_u, 3, 1, (void**)&buf_y, &siz_z);
+  c3_free(buf_y);
+
+  if ( c3y == sav_o ) {
+    fprintf(stderr, "book_tests: contig should have failed for gap\r\n");
+    goto shut;
+  }
+
+  //  event 2: must be accepted
+  //
+  _test_make_event(&buf_y, &siz_z, 2);
+  sav_o = u3_book_save(log_u, 2, 1, (void**)&buf_y, &siz_z);
+  c3_free(buf_y);
+
+  if ( c3n == sav_o ) {
+    fprintf(stderr, "book_tests: contig save 2 failed\r\n");
+    goto shut;
+  }
+
+  ret_o = c3y;
+
+shut:
+  u3_book_exit(log_u);
+tidy:
+  _test_cleanup(tmp_c);
+  c3_free(tmp_c);
+  return ret_o;
+}
+
+/* _test_book_partial_read(): test reading subset of events.
+*/
+static c3_o
+_test_book_partial_read(void)
+{
+  c3_c*    tmp_c = _test_tmpdir("book-partial");
+  u3_book* log_u;
+  void*    bufs[100];
+  c3_z     sizes[100];
+  c3_d     i;
+  c3_o     sav_o;
+  c3_o     ret_o = c3n;
+  read_ctx ctx   = { .count = 0, .expected_start = 50, .failed = c3n };
+
+  if ( !tmp_c ) {
+    return c3n;
+  }
+
+  log_u = u3_book_init(tmp_c);
+  if ( !log_u ) {
+    goto tidy;
+  }
+
+  //  write 100 events
+  //
+  for ( i = 0; i < 100; i++ ) {
+    _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1);
+  }
+
+  sav_o = u3_book_save(log_u, 1, 100, bufs, sizes);
+
+  for ( i = 0; i < 100; i++ ) {
+    c3_free(bufs[i]);
+  }
+
+  if ( c3n == sav_o ) {
+    fprintf(stderr, "book_tests: partial save failed\r\n");
+    goto shut;
+  }
+
+  //  read events 50-75 (26 events) out of the middle
+  //
+  if ( c3n == u3_book_read(log_u, &ctx, 50, 26, _test_read_cb) ) {
+    fprintf(stderr, "book_tests: partial read failed\r\n");
+    goto shut;
+  }
+
+  if ( c3y == ctx.failed || 26 != ctx.count ) {
+    fprintf(stderr, "book_tests: partial verify failed: count=%llu\r\n",
+                    ctx.count);
+    goto shut;
+  }
+
+  ret_o = c3y;
+
+shut:
+  u3_book_exit(log_u);
+tidy:
+  _test_cleanup(tmp_c);
+  c3_free(tmp_c);
+  return ret_o;
+}
+
+/* _test_book_iterator(): test walk iterator pattern.
+*/
+static c3_o
+_test_book_iterator(void)
+{
+  c3_c*        tmp_c = _test_tmpdir("book-iter");
+  u3_book*     log_u;
+  u3_book_walk itr_u;
+  void*        bufs[50];
+  c3_z         sizes[50];
+  c3_d         i;
+  c3_z         len_z;
+  void*        buf_v;
+  c3_d         count = 0;
+  c3_o         sav_o;
+  c3_o         ret_o = c3n;
+
+  if ( !tmp_c ) {
+    return c3n;
+  }
+
+  log_u = u3_book_init(tmp_c);
+  if ( !log_u ) {
+    goto tidy;
+  }
+
+  //  write 50 events
+  //
+  for ( i = 0; i < 50; i++ ) {
+    _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1);
+  }
+
+  sav_o = u3_book_save(log_u, 1, 50, bufs, sizes);
+
+  for ( i = 0; i < 50; i++ ) {
+    c3_free(bufs[i]);
+  }
+
+  if ( c3n == sav_o ) {
+    fprintf(stderr, "book_tests: iter save failed\r\n");
+    goto shut;
+  }
+
+  //  iterate events 10-30
+  //
+  if ( c3n == u3_book_walk_init(log_u, &itr_u, 10, 30) ) {
+    fprintf(stderr, "book_tests: walk_init failed\r\n");
+    goto shut;
+  }
+
+  //  each buffer handed out by the iterator is freed here
+  //
+  while ( c3y == u3_book_walk_next(&itr_u, &len_z, &buf_v) ) {
+    c3_o ver_o = _test_verify_event(10 + count, len_z, buf_v);
+
+    c3_free(buf_v);
+
+    if ( c3n == ver_o ) {
+      fprintf(stderr, "book_tests: iter verify failed at %llu\r\n", count);
+      goto stop;
+    }
+
+    count++;
+  }
+
+  if ( 21 != count ) {  // events 10-30 inclusive = 21 events
+    fprintf(stderr, "book_tests: iter count wrong: %llu\r\n", count);
+    goto stop;
+  }
+
+  ret_o = c3y;
+
+stop:
+  u3_book_walk_done(&itr_u);
+shut:
+  u3_book_exit(log_u);
+tidy:
+  _test_cleanup(tmp_c);
+  c3_free(tmp_c);
+  return ret_o;
+}
+
+/* metadata callback context
+*/
+typedef struct _meta_ctx {
+  c3_o found;       //  c3y if the callback received a value
+  c3_z len_z;       //  length of the value received
+  c3_y buf_y[256];  //  copy of the value (when it fits)
+} meta_ctx;
+
+/* _test_meta_cb(): callback for u3_book_read_meta().
+*/
+static void
+_test_meta_cb(void* ptr_v, c3_zs len_zs, void* val_v)
+{
+  meta_ctx* ctx = (meta_ctx*)ptr_v;
+
+  //  a negative length is treated as "no value"; a value that does
+  //  not fit in buf_y cannot be compared, so it is rejected as well
+  //
+  if ( (len_zs < 0) || ((c3_z)len_zs > sizeof(ctx->buf_y)) ) {
+    ctx->found = c3n;
+    ctx->len_z = 0;
+    return;
+  }
+
+  ctx->found = c3y;
+  ctx->len_z = len_zs;
+  if ( len_zs > 0 ) {
+    memcpy(ctx->buf_y, val_v, len_zs);
+  }
+}
+
+/* _test_meta_expect(): read [key_c], require [len_z] bytes equal to [val_v].
+**
+**   [tag_c] names the value in failure messages.
+*/
+static c3_o
+_test_meta_expect(u3_book* log_u,
+                  c3_c*    key_c,
+                  c3_c*    tag_c,
+                  c3_z     len_z,
+                  void*    val_v)
+{
+  meta_ctx ctx = {0};
+
+  ctx.found = c3n;
+  u3_book_read_meta(log_u, &ctx, key_c, _test_meta_cb);
+
+  if ( c3n == ctx.found || ctx.len_z != len_z ) {
+    fprintf(stderr, "book_tests: meta read %s failed\r\n", tag_c);
+    return c3n;
+  }
+
+  if ( 0 != memcmp(ctx.buf_y, val_v, len_z) ) {
+    fprintf(stderr, "book_tests: meta %s mismatch\r\n", tag_c);
+    return c3n;
+  }
+
+  return c3y;
+}
+
+/* _test_book_metadata(): test metadata read/write operations.
+**
+**   saves three keys, reads each back, checks that a missing key is
+**   reported as absent, then overwrites one key and re-reads it.
+*/
+static c3_o
+_test_book_metadata(void)
+{
+  c3_c*    tmp_c = _test_tmpdir("book-meta");
+  u3_book* log_u;
+  meta_ctx ctx = {0};
+  c3_w     version = 1;
+  c3_y     who[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
+  c3_o     fake = c3y;
+  c3_o     ret_o = c3n;
+
+  if ( !tmp_c ) {
+    return c3n;
+  }
+
+  log_u = u3_book_init(tmp_c);
+  if ( !log_u ) {
+    goto tidy;
+  }
+
+  //  write metadata
+  //
+  if ( c3n == u3_book_save_meta(log_u, "version", sizeof(version), &version) ) {
+    fprintf(stderr, "book_tests: meta save version failed\r\n");
+    goto shut;
+  }
+
+  if ( c3n == u3_book_save_meta(log_u, "who", sizeof(who), who) ) {
+    fprintf(stderr, "book_tests: meta save who failed\r\n");
+    goto shut;
+  }
+
+  if ( c3n == u3_book_save_meta(log_u, "fake", sizeof(fake), &fake) ) {
+    fprintf(stderr, "book_tests: meta save fake failed\r\n");
+    goto shut;
+  }
+
+  //  read every saved key back
+  //
+  if (  (c3n == _test_meta_expect(log_u, "version", "version",
+                                  sizeof(version), &version))
+     || (c3n == _test_meta_expect(log_u, "who", "who", sizeof(who), who))
+     || (c3n == _test_meta_expect(log_u, "fake", "fake", sizeof(fake), &fake)) )
+  {
+    goto shut;
+  }
+
+  //  read non-existent key: the callback must report it as absent
+  //
+  ctx.found = c3y;
+  u3_book_read_meta(log_u, &ctx, "nonexistent", _test_meta_cb);
+  if ( c3y == ctx.found ) {
+    fprintf(stderr, "book_tests: meta read nonexistent should fail\r\n");
+    goto shut;
+  }
+
+  //  update existing key and confirm the new value is returned
+  //
+  version = 2;
+  if ( c3n == u3_book_save_meta(log_u, "version", sizeof(version), &version) ) {
+    fprintf(stderr, "book_tests: meta update version failed\r\n");
+    goto shut;
+  }
+
+  if ( c3n == _test_meta_expect(log_u, "version", "updated version",
+                                sizeof(version), &version) )
+  {
+    goto shut;
+  }
+
+  ret_o = c3y;
+
+shut:
+  u3_book_exit(log_u);
+tidy:
+  _test_cleanup(tmp_c);
+  c3_free(tmp_c);
+  return ret_o;
+}
+
+/* _test_book_core(): run all core book tests.
+*/
+static c3_o
+_test_book_core(void)
+{
+  //  every test runs even after an earlier one has failed
+  //
+  static const struct {
+    const c3_c* nam_c;
+    c3_o      (*fun_f)(void);
+  } tes_u[] = {
+    { "init_empty",   _test_book_init_empty   },
+    { "single_event", _test_book_single_event },
+    { "batch_write",  _test_book_batch_write  },
+    { "persistence",  _test_book_persistence  },
+    { "contiguity",   _test_book_contiguity   },
+    { "partial_read", _test_book_partial_read },
+    { "iterator",     _test_book_iterator     },
+    { "metadata",     _test_book_metadata     },
+  };
+  c3_o ret = c3y;
+
+  for ( c3_z i = 0; i < (sizeof(tes_u) / sizeof(tes_u[0])); i++ ) {
+    if ( c3n == tes_u[i].fun_f() ) {
+      fprintf(stderr, "book_tests: %s failed\r\n", tes_u[i].nam_c);
+      ret = c3n;
+    }
+  }
+
+  return ret;
+}
+
+/* main
+*/
+int
+main(int argc, char* argv[])
+{
+  if ( c3n == _test_book_core() ) {
+    fprintf(stderr, "book tests failed\r\n");
+    return 1;
+  }
+
+  fprintf(stderr, "test book: ok\n");
+  return 0;
+}
diff --git a/pkg/vere/build.zig b/pkg/vere/build.zig
index 836fb15221..afa5c004ce 100644
--- a/pkg/vere/build.zig
+++ b/pkg/vere/build.zig
@@ -220,6 +220,7 @@ const c_source_files = [_][]const u8{
     "ca_bundle/ca_bundle.c",
     "dawn.c",
     "db/lmdb.c",
+    "db/book.c",
     "disk.c",
     "foil.c",
     "io/ames.c",
@@ -250,6 +251,7 @@ const c_source_files = [_][]const u8{
 
 const install_headers = [_][]const u8{
     "db/lmdb.h",
+    "db/book.h",
     "dns_sd.h",
     "io/ames/stun.h",
     "io/lss.h",
diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c
new file mode 100644
index 0000000000..4b10a21eb9
--- /dev/null
+++ b/pkg/vere/db/book.c
@@ -0,0 +1,1163 @@
+/// @file
+
+#include "db/book.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <zlib.h>
+
+#include "c3/c3.h"
+#include "noun.h"
+
+// 
book: append-only event log
+//
+//  simple persistence layer replacing LMDB for event storage.
+//  optimized for sequential writes and reads, no random access.
+//
+//  file format:
+//    [64-byte header]
+//    [events: len_d | mug_l | jam_data | crc_m | let_d]
+//    [metadata section]
+//
+
+/* constants
+*/
+  #define BOOK_MAGIC    0x424f4f4b  // "BOOK"
+  #define BOOK_VERSION  1           // format version
+
+/* _book_crc32(): compute CRC32 checksum.
+*/
+static c3_w
+_book_crc32(c3_y* buf_y, c3_w len_w)
+{
+  return (c3_w)crc32(0L, buf_y, len_w);
+}
+
+/* _book_crc32_two(): compute CRC32 over two buffers.
+**
+**   equivalent to one CRC32 over [one_y] followed by [two_y].
+*/
+static c3_w
+_book_crc32_two(c3_y* one_y, c3_w one_w, c3_y* two_y, c3_w two_w)
+{
+  c3_w crc_w = (c3_w)crc32(0L, one_y, one_w);
+  return (c3_w)crc32(crc_w, two_y, two_w);
+}
+
+/* _book_write_header(): write header to file at offset 0.
+**
+**   writes and syncs log_u->hed_u; clears the dirty flag on success.
+*/
+static c3_o
+_book_write_header(u3_book* log_u)
+{
+  c3_zs ret_zs = pwrite(log_u->fid_i, &log_u->hed_u,
+                        sizeof(u3_book_head), 0);
+
+  if ( ret_zs < 0 ) {
+    fprintf(stderr, "book: failed to write header: %s\r\n",
+                    strerror(errno));
+    return c3n;
+  }
+
+  //  a short write does not set errno, so report the byte counts
+  //
+  if ( (c3_z)ret_zs != sizeof(u3_book_head) ) {
+    fprintf(stderr, "book: failed to write header: short write (%lld of %zu bytes)\r\n",
+                    (long long)ret_zs, sizeof(u3_book_head));
+    return c3n;
+  }
+
+  if ( -1 == c3_sync(log_u->fid_i) ) {
+    fprintf(stderr, "book: failed to sync header: %s\r\n",
+                    strerror(errno));
+    return c3n;
+  }
+
+  log_u->dit_o = c3n;
+  return c3y;
+}
+
+/* _book_read_header(): read and validate header.
+**
+**   fills log_u->hed_u from offset 0 and checks magic and version.
+*/
+static c3_o
+_book_read_header(u3_book* log_u)
+{
+  c3_zs ret_zs = pread(log_u->fid_i, &log_u->hed_u,
+                       sizeof(u3_book_head), 0);
+
+  if ( ret_zs < 0 ) {
+    fprintf(stderr, "book: failed to read header: %s\r\n",
+                    strerror(errno));
+    return c3n;
+  }
+
+  if ( (c3_z)ret_zs != sizeof(u3_book_head) ) {
+    fprintf(stderr, "book: failed to read header\r\n");
+    return c3n;
+  }
+
+  if ( BOOK_MAGIC != log_u->hed_u.mag_w ) {
+    fprintf(stderr, "book: invalid magic: 0x%08x\r\n",
+                    log_u->hed_u.mag_w);
+    return c3n;
+  }
+
+  if ( BOOK_VERSION != log_u->hed_u.ver_w ) {
+    fprintf(stderr, "book: unsupported version: %u\r\n",
+                    log_u->hed_u.ver_w);
+    return c3n;
+  }
+
+  return c3y;
+}
+
+/* _book_init_header(): initialize header for new file.
+*/
+static void
+_book_init_header(u3_book* log_u)
+{
+  //  an all-zero header with magic and version set describes an
+  //  empty log; it is marked dirty so it gets written out
+  //
+  memset(&log_u->hed_u, 0, sizeof(u3_book_head));
+  log_u->hed_u.mag_w = BOOK_MAGIC;
+  log_u->hed_u.ver_w = BOOK_VERSION;
+  log_u->hed_u.fir_d = 0;
+  log_u->hed_u.las_d = 0;
+  log_u->hed_u.off_w = 0;
+  log_u->hed_u.len_w = 0;
+  log_u->dit_o = c3y;
+}
+
+/* _book_read_record(): read event record at offset.
+**
+**   returns:
+**     c3y: success, buffers allocated, *off_w advanced past the record
+**     c3n: failure (EOF or corruption); *off_w is unchanged and
+**          *mug_y / *jam_y are set to 0
+**
+**   on success, caller must free *mug_y and *jam_y
+**
+**   the CRC and the len_d == let_d invariant are NOT checked here;
+**   callers validate them.
+*/
+static c3_o
+_book_read_record(c3_i   fid_i,
+                  c3_w*  off_w,
+                  c3_d*  len_d,
+                  c3_y** mug_y,
+                  c3_y** jam_y,
+                  c3_w*  crc_w,
+                  c3_d*  let_d)
+{
+  c3_zs ret_zs;
+  c3_w  off_now = *off_w;
+  c3_d  jam_len;
+
+  *mug_y = 0;
+  *jam_y = 0;
+
+  //  read len_d (8 bytes)
+  //
+  ret_zs = pread(fid_i, len_d, sizeof(c3_d), off_now);
+  if ( ret_zs != sizeof(c3_d) ) {
+    return c3n;
+  }
+  off_now += sizeof(c3_d);
+
+  //  validate length: len_d covers the 4-byte mug plus the jam, so
+  //  anything below 4 would underflow the jam length computed below
+  //
+  if ( (*len_d < 4) || ((1ULL << 32) < *len_d) ) {
+    fprintf(stderr, "book: invalid length: %llu\r\n", *len_d);
+    return c3n;
+  }
+  jam_len = *len_d - 4;
+
+  //  read mug_l (4 bytes)
+  //
+  *mug_y = c3_malloc(4);
+  ret_zs = pread(fid_i, *mug_y, 4, off_now);
+  if ( ret_zs != 4 ) {
+    goto fail;
+  }
+  off_now += 4;
+
+  //  read jam data (len_d - 4 bytes, since len_d includes mug)
+  //
+  *jam_y = c3_malloc(jam_len);
+  ret_zs = pread(fid_i, *jam_y, jam_len, off_now);
+  if ( ret_zs != (c3_zs)jam_len ) {
+    goto fail;
+  }
+  off_now += jam_len;
+
+  //  read crc_m (4 bytes)
+  //
+  ret_zs = pread(fid_i, crc_w, sizeof(c3_w), off_now);
+  if ( ret_zs != sizeof(c3_w) ) {
+    goto fail;
+  }
+  off_now += sizeof(c3_w);
+
+  //  read let_d (8 bytes)
+  //
+  ret_zs = pread(fid_i, let_d, sizeof(c3_d), off_now);
+  if ( ret_zs != sizeof(c3_d) ) {
+    goto fail;
+  }
+  off_now += sizeof(c3_d);
+
+  //  update offset
+  //
+  *off_w = off_now;
+  return c3y;
+
+fail:
+  //  release whatever was allocated and leave no dangling pointers
+  //
+  c3_free(*mug_y);
+  c3_free(*jam_y);
+  *mug_y = 0;
+  *jam_y = 0;
+  return c3n;
+}
+
+/* _book_scan_end(): scan to find actual end of valid events.
+**
+**   validates each record's CRC and len_d == let_d.
+**   returns offset to append next event, i.e. the offset just past
+**   the last record that validated.
+**   updates header if corruption detected: when the number of valid
+**   records differs from what the header claims, the header is
+**   rewritten and the file is truncated at the returned offset.
+*/
+static c3_w
+_book_scan_end(u3_book* log_u)
+{
+  c3_w end_w   = sizeof(u3_book_head);  //  just past last valid record
+  c3_d count_d = 0;
+  c3_d expected_d;
+
+  if ( 0 == log_u->hed_u.fir_d && 0 == log_u->hed_u.las_d ) {
+    //  empty log
+    return end_w;
+  }
+
+  expected_d = log_u->hed_u.las_d - log_u->hed_u.fir_d + 1;
+
+  while ( 1 ) {
+    c3_d  len_d, let_d;
+    c3_y* mug_y;
+    c3_y* jam_y;
+    c3_w  crc_w, calc_crc;
+    //  candidate end offset: only committed to end_w once the record
+    //  has passed every check, so a bad record is never counted as
+    //  part of the log
+    //
+    c3_w  nex_w = end_w;
+
+    if ( c3n == _book_read_record(log_u->fid_i, &nex_w,
+                                  &len_d, &mug_y, &jam_y,
+                                  &crc_w, &let_d) )
+    {
+      //  EOF or read error
+      break;
+    }
+
+    //  validate len_d == let_d
+    //
+    if ( len_d != let_d ) {
+      fprintf(stderr, "book: length mismatch at offset %u\r\n", end_w);
+      c3_free(mug_y);
+      c3_free(jam_y);
+      break;
+    }
+
+    //  validate CRC: CRC32(len_d || mug || jam)
+    //
+    {
+      c3_y buf_y[12];  // 8 bytes len_d + 4 bytes mug
+      memcpy(buf_y, &len_d, 8);
+      memcpy(buf_y + 8, mug_y, 4);
+
+      calc_crc = _book_crc32(buf_y, 12);
+      calc_crc = (c3_w)crc32(calc_crc, jam_y, len_d - 4);
+    }
+
+    c3_free(mug_y);
+    c3_free(jam_y);
+
+    if ( crc_w != calc_crc ) {
+      fprintf(stderr, "book: CRC mismatch at offset %u\r\n", end_w);
+      break;
+    }
+
+    end_w = nex_w;
+    count_d++;
+  }
+
+  //  check if the valid record count disagrees with the header
+  //
+  if ( count_d != expected_d ) {
+    fprintf(stderr, "book: recovery: found %llu events, expected %llu\r\n",
+                    count_d, expected_d);
+
+    //  update header
+    //
+    if ( 0 == count_d ) {
+      log_u->hed_u.fir_d = 0;
+      log_u->hed_u.las_d = 0;
+    }
+    else {
+      log_u->hed_u.las_d = log_u->hed_u.fir_d + count_d - 1;
+    }
+
+    log_u->dit_o = c3y;
+    if ( c3n == _book_write_header(log_u) ) {
+      //  dit_o stays set, so u3_book_exit() will retry the write
+      //
+      fprintf(stderr, "book: recovery: failed to update header\r\n");
+    }
+
+    //  truncate file right after the last valid record
+    //
+    if ( -1 == ftruncate(log_u->fid_i, end_w) ) {
+      fprintf(stderr, "book: failed to truncate: %s\r\n",
+                      strerror(errno));
+    }
+    else {
+      c3_sync(log_u->fid_i);
+    }
+  }
+
+  return end_w;
+}
+
+/* u3_book_init(): open/create event log.
+*/
+u3_book*
+u3_book_init(const c3_c* pax_c)
+{
+  //  opens (or creates) [pax_c]/event.log. a new file gets a fresh
+  //  header; an existing file has its header validated and its
+  //  records scanned to find the append offset. returns 0 on failure.
+  //
+  c3_c        path_c[8193];
+  int         len_i;
+  c3_i        fid_i;
+  struct stat buf_u;
+  u3_book*    log_u = 0;
+
+  //  construct path to event.log, refusing a truncated path
+  //
+  len_i = snprintf(path_c, sizeof(path_c), "%s/event.log", pax_c);
+  if ( (len_i < 0) || ((c3_z)len_i >= sizeof(path_c)) ) {
+    fprintf(stderr, "book: path too long: %s/event.log\r\n", pax_c);
+    return 0;
+  }
+
+  //  open or create file
+  //
+  fid_i = c3_open(path_c, O_RDWR | O_CREAT, 0644);
+  if ( 0 > fid_i ) {
+    fprintf(stderr, "book: failed to open %s: %s\r\n",
+                    path_c, strerror(errno));
+    return 0;
+  }
+
+  //  get file size
+  //
+  if ( 0 > fstat(fid_i, &buf_u) ) {
+    fprintf(stderr, "book: fstat failed: %s\r\n", strerror(errno));
+    goto fail;
+  }
+
+  //  allocate log structure
+  //
+  log_u = c3_calloc(sizeof(u3_book));
+  log_u->fid_i = fid_i;
+  log_u->pax_c = c3_malloc((c3_z)len_i + 1);
+  memcpy(log_u->pax_c, path_c, (c3_z)len_i + 1);
+
+  if ( 0 == buf_u.st_size ) {
+    //  new file: initialize header; a log whose header never reached
+    //  the disk cannot be reopened, so a failed write is fatal here
+    //
+    _book_init_header(log_u);
+    if ( c3n == _book_write_header(log_u) ) {
+      goto fail;
+    }
+    log_u->off_w = sizeof(u3_book_head);
+  }
+  else if ( buf_u.st_size < (off_t)sizeof(u3_book_head) ) {
+    //  corrupt file: too small
+    //
+    fprintf(stderr, "book: file too small: %lld bytes\r\n",
+                    (long long)buf_u.st_size);
+    goto fail;
+  }
+  else {
+    //  existing file: read and validate header
+    //
+    if ( c3n == _book_read_header(log_u) ) {
+      goto fail;
+    }
+
+    //  scan to find actual end, recover from corruption
+    //
+    log_u->off_w = _book_scan_end(log_u);
+  }
+
+  return log_u;
+
+fail:
+  close(fid_i);
+  if ( log_u ) {
+    c3_free(log_u->pax_c);
+    c3_free(log_u);
+  }
+  return 0;
+}
+
+/* u3_book_exit(): close event log.
+**
+**   flushes the header if it is dirty, closes the file and frees
+**   [log_u]. a null [log_u] is ignored.
+*/
+void
+u3_book_exit(u3_book* log_u)
+{
+  if ( !log_u ) {
+    return;
+  }
+
+  //  sync header if dirty
+  //
+  if ( c3y == log_u->dit_o ) {
+    _book_write_header(log_u);
+  }
+
+  //  close file
+  //
+  close(log_u->fid_i);
+
+  //  free resources
+  //
+  c3_free(log_u->pax_c);
+  c3_free(log_u);
+}
+
+/* u3_book_gulf(): read first and last event numbers.
+*/
+c3_o
+u3_book_gulf(u3_book* log_u, c3_d* low_d, c3_d* hig_d)
+{
+  //  the range comes straight from the in-memory header
+  //
+  if ( log_u ) {
+    *low_d = log_u->hed_u.fir_d;
+    *hig_d = log_u->hed_u.las_d;
+    return c3y;
+  }
+
+  return c3n;
+}
+
+/* u3_book_stat(): print book statistics.
+**
+**   opens the log under [pax_c] via u3_book_init(), prints header
+**   fields and file size to stderr, then closes it again.
+*/
+void
+u3_book_stat(const c3_c* pax_c)
+{
+  struct stat buf_u;
+  c3_d        num_d = 0;
+  u3_book*    log_u = u3_book_init(pax_c);
+
+  if ( !log_u ) {
+    fprintf(stderr, "book: failed to open for stats\r\n");
+    return;
+  }
+
+  if ( 0 > fstat(log_u->fid_i, &buf_u) ) {
+    fprintf(stderr, "book: fstat failed\r\n");
+    u3_book_exit(log_u);
+    return;
+  }
+
+  //  a last event of 0 means the log is empty
+  //
+  if ( 0 != log_u->hed_u.las_d ) {
+    num_d = log_u->hed_u.las_d - log_u->hed_u.fir_d + 1;
+  }
+
+  fprintf(stderr, "book info:\r\n");
+  fprintf(stderr, "  file: %s\r\n", log_u->pax_c);
+  fprintf(stderr, "  version: %u\r\n", log_u->hed_u.ver_w);
+  fprintf(stderr, "  first event: %llu\r\n", log_u->hed_u.fir_d);
+  fprintf(stderr, "  last event: %llu\r\n", log_u->hed_u.las_d);
+  fprintf(stderr, "  event count: %llu\r\n", num_d);
+  fprintf(stderr, "  file size: %lld bytes\r\n", (long long)buf_u.st_size);
+  fprintf(stderr, "  metadata offset: %u\r\n", log_u->hed_u.off_w);
+  fprintf(stderr, "  metadata length: %u\r\n", log_u->hed_u.len_w);
+
+  u3_book_exit(log_u);
+}
+
+/* u3_book_save(): save [len_d] events starting at [eve_d].
+**
+**   byt_p: array of buffers (mug + jam format)
+**   siz_i: array of buffer sizes
+**
+**   [eve_d] must be 1 on an empty log and las_d + 1 otherwise.
+**   all buffers are validated before anything is written; the header
+**   (fir_d, las_d) and the append offset are only updated after every
+**   record has been written and synced. a [len_d] of 0 is a no-op.
+*/
+c3_o
+u3_book_save(u3_book* log_u,
+             c3_d     eve_d,
+             c3_d     len_d,
+             void**   byt_p,
+             c3_z*    siz_i)
+{
+  c3_d i;
+  c3_d end_d;    //  64-bit running end offset, so overflow is detectable
+  c3_w off_now;
+  c3_o new_o;    //  c3y if the log is empty
+
+  if ( !log_u ) {
+    return c3n;
+  }
+
+  //  validate contiguity
+  //
+  new_o = ( 0 == log_u->hed_u.las_d ) ? c3y : c3n;
+
+  if ( c3y == new_o ) {
+    //  empty log: first event must be 1
+    //
+    if ( 1 != eve_d ) {
+      fprintf(stderr, "book: first event must be 1, got %llu\r\n", eve_d);
+      return c3n;
+    }
+  }
+  else if ( eve_d != log_u->hed_u.las_d + 1 ) {
+    //  non-empty: must be contiguous
+    //
+    fprintf(stderr, "book: event gap: expected %llu, got %llu\r\n",
+                    log_u->hed_u.las_d + 1, eve_d);
+    return c3n;
+  }
+
+  //  nothing to write: leave header and file untouched
+  //
+  if ( 0 == len_d ) {
+    return c3y;
+  }
+
+  //  validate every buffer, and the resulting file size, up front so
+  //  a bad batch never leaves a partial write behind. offsets in this
+  //  file are 32-bit, so the log must not grow past that range
+  //
+  end_d = log_u->off_w;
+
+  for ( i = 0; i < len_d; i++ ) {
+    c3_d siz_d = (c3_d)siz_i[i];
+
+    if ( siz_d < 4 ) {
+      fprintf(stderr, "book: event %llu buffer too small: %llu\r\n",
+                      eve_d + i, siz_d);
+      return c3n;
+    }
+
+    //  len_d(8) + payload + crc(4) + let_d(8); the guard keeps the
+    //  addition itself from wrapping for absurd sizes
+    //
+    if ( siz_d > 0xffffffffULL ) {
+      fprintf(stderr, "book: event %llu exceeds 32-bit file offset\r\n",
+                      eve_d + i);
+      return c3n;
+    }
+    end_d += 8 + siz_d + 4 + 8;
+
+    if ( end_d > 0xffffffffULL ) {
+      fprintf(stderr, "book: event %llu exceeds 32-bit file offset\r\n",
+                      eve_d + i);
+      return c3n;
+    }
+  }
+
+  //  write each event record
+  //
+  off_now = log_u->off_w;
+
+  for ( i = 0; i < len_d; i++ ) {
+    c3_y* buf_y     = (c3_y*)byt_p[i];
+    c3_d  len_write = (c3_d)siz_i[i];  //  total payload: mug + jam
+    c3_y* jam_y     = buf_y + 4;
+    c3_d  jam_len   = len_write - 4;
+    c3_l  mug_l;
+    c3_w  crc_w;
+    c3_w  j;
+
+    //  extract mug from buffer (first 4 bytes)
+    //
+    memcpy(&mug_l, buf_y, 4);
+
+    //  compute CRC32 over: len_d (8 bytes) + mug_l (4 bytes) + jam data
+    //
+    {
+      c3_y tmp_buf[12];
+      memcpy(tmp_buf, &len_write, 8);
+      memcpy(tmp_buf + 8, &mug_l, 4);
+      crc_w = _book_crc32_two(tmp_buf, 12, jam_y, jam_len);
+    }
+
+    //  write record: len_d | mug_l | jam | crc_m | let_d
+    //
+    {
+      struct {
+        const c3_c* nam_c;
+        const void* buf_v;
+        c3_z        len_z;
+      } seg_u[5] = {
+        { "len_d", &len_write, 8       },
+        { "mug",   &mug_l,     4       },
+        { "jam",   jam_y,      jam_len },
+        { "crc",   &crc_w,     4       },
+        { "let_d", &len_write, 8       },
+      };
+
+      for ( j = 0; j < 5; j++ ) {
+        c3_zs ret_zs = pwrite(log_u->fid_i, seg_u[j].buf_v,
+                              seg_u[j].len_z, off_now);
+
+        if ( ret_zs != (c3_zs)seg_u[j].len_z ) {
+          fprintf(stderr, "book: failed to write %s for event %llu: %s\r\n",
+                          seg_u[j].nam_c, eve_d + i, strerror(errno));
+          return c3n;
+        }
+        off_now += seg_u[j].len_z;
+      }
+    }
+  }
+
+  //  sync data to disk
+  //
+  if ( -1 == c3_sync(log_u->fid_i) ) {
+    fprintf(stderr, "book: failed to sync events: %s\r\n",
+                    strerror(errno));
+    return c3n;
+  }
+
+  //  update header only now that the records are durable
+  //
+  if ( c3y == new_o ) {
+    log_u->hed_u.fir_d = eve_d;
+  }
+  log_u->hed_u.las_d = eve_d + len_d - 1;
+  log_u->off_w = off_now;
+  log_u->dit_o = c3y;
+
+  //  write and sync header
+  //
+  if ( c3n == _book_write_header(log_u) ) {
+    return c3n;
+  }
+
+  return c3y;
+}
+
+/* u3_book_read(): read [len_d] events starting at [eve_d].
+**
+**   invokes callback for each event with:
+**     ptr_v: context pointer
+**     eve_d: event number
+**     len_i: buffer size (mug + jam)
+**     buf_v: buffer pointer (mug + jam format)
+**
+**   every record is validated (len_d == let_d, CRC) before it is
+**   handed to the callback. the buffer is freed as soon as the
+**   callback returns, so the callback must copy what it keeps.
+**   returns c3n on a bad range, a failed or invalid record, or when
+**   the callback returns c3n.
+*/
+c3_o
+u3_book_read(u3_book* log_u,
+             void*    ptr_v,
+             c3_d     eve_d,
+             c3_d     len_d,
+             c3_o  (*read_f)(void*, c3_d, c3_z, void*))
+{
+  c3_w off_w;
+  c3_d cur_d;
+  c3_d i;
+
+  if ( !log_u || !read_f ) {
+    return c3n;
+  }
+
+  // validate range
+  if ( 0 == log_u->hed_u.las_d ) {
+    // empty log
+    fprintf(stderr, "book: read from empty log\r\n");
+    return c3n;
+  }
+
+  if ( eve_d < log_u->hed_u.fir_d || eve_d > log_u->hed_u.las_d ) {
+    fprintf(stderr, "book: event %llu out of range [%llu, %llu]\r\n",
+                    eve_d, log_u->hed_u.fir_d, log_u->hed_u.las_d);
+    return c3n;
+  }
+
+  // compare counts rather than computing eve_d + len_d - 1, which
+  // wraps for a large [len_d] and would let an oversized range through
+  if ( len_d && (len_d - 1 > log_u->hed_u.las_d - eve_d) ) {
+    fprintf(stderr, "book: read range exceeds last event\r\n");
+    return c3n;
+  }
+
+  // scan to starting event
+  off_w = sizeof(u3_book_head);
+  cur_d = log_u->hed_u.fir_d;
+
+  while ( cur_d < eve_d ) {
+    c3_d  skip_len;
+    c3_zs ret_zs;
+
+    ret_zs = pread(log_u->fid_i, &skip_len, sizeof(c3_d), off_w);
+    if ( ret_zs != (c3_zs)sizeof(c3_d) ) {
+      fprintf(stderr, "book: failed to scan to event %llu\r\n", eve_d);
+      return c3n;
+    }
+
+    // skip_len comes straight from disk: a payload shorter than its
+    // mug, or a record running past the 32-bit offset range, is
+    // corruption and must not be allowed to wrap the offset
+    if (  (skip_len < 4)
+       || (skip_len > 0xffffffffULL)
+       || ((c3_d)off_w + 20 + skip_len > 0xffffffffULL) )
+    {
+      fprintf(stderr, "book: corrupt record length scanning to event %llu\r\n",
+                      eve_d);
+      return c3n;
+    }
+
+    // skip entire record: len_d(8) + mug(4) + jam(len-4) + crc(4) + let_d(8)
+    off_w += (c3_w)(20 + skip_len);
+    cur_d++;
+  }
+
+  // read requested events
+  for ( i = 0; i < len_d; i++, cur_d++ ) {
+    c3_d  len_rec;
+    c3_y* mug_y;
+    c3_y* jam_y;
+    c3_w  crc_w, calc_crc;
+    c3_d  let_d;
+    c3_y* buf_y;
+    c3_z  len_z;
+
+    // read record
+    if ( c3n == _book_read_record(log_u->fid_i, &off_w,
+                                  &len_rec, &mug_y, &jam_y,
+                                  &crc_w, &let_d) )
+    {
+      fprintf(stderr, "book: failed to read event %llu\r\n", cur_d);
+      return c3n;
+    }
+
+    // validate len_d == let_d
+    if ( len_rec != let_d ) {
+      fprintf(stderr, "book: length mismatch at event %llu\r\n", cur_d);
+      c3_free(mug_y);
+      c3_free(jam_y);
+      return c3n;
+    }
+
+    // payload must at least hold the mug, or len_rec - 4 underflows
+    if ( len_rec < 4 ) {
+      fprintf(stderr, "book: record too short at event %llu\r\n", cur_d);
+      c3_free(mug_y);
+      c3_free(jam_y);
+      return c3n;
+    }
+
+    // validate CRC over len_d (8) + mug (4) + jam, using the same
+    // helper as the writer in u3_book_save()
+    {
+      c3_y tmp_buf[12];
+      memcpy(tmp_buf, &len_rec, 8);
+      memcpy(tmp_buf + 8, mug_y, 4);
+      calc_crc = _book_crc32_two(tmp_buf, 12, jam_y, len_rec - 4);
+    }
+
+    if ( crc_w != calc_crc ) {
+      fprintf(stderr, "book: CRC mismatch at event %llu\r\n", cur_d);
+      c3_free(mug_y);
+      c3_free(jam_y);
+      return c3n;
+    }
+
+    // reconstruct buffer in mug + jam format for callback
+    len_z = len_rec;
+    buf_y = c3_malloc(len_z);
+    memcpy(buf_y, mug_y, 4);
+    memcpy(buf_y + 4, jam_y, len_rec - 4);
+
+    c3_free(mug_y);
+    c3_free(jam_y);
+
+    // invoke callback
+    if ( c3n == read_f(ptr_v, cur_d, len_z, buf_y) ) {
+      c3_free(buf_y);
+      return c3n;
+    }
+
+    c3_free(buf_y);
+  }
+
+  return c3y;
+}
+
+/* u3_book_walk_init(): initialize event iterator.
+**
+**   sets up iterator to read events from [nex_d] to [las_d] inclusive.
+**   both bounds must lie within the log and [nex_d] <= [las_d].
+**   the iterator borrows the log's file descriptor; it does not own it.
+*/
+c3_o
+u3_book_walk_init(u3_book*      log_u,
+                  u3_book_walk* itr_u,
+                  c3_d          nex_d,
+                  c3_d          las_d)
+{
+  c3_w off_w;
+  c3_d cur_d;
+
+  if ( !log_u || !itr_u ) {
+    return c3n;
+  }
+
+  // validate range
+  if ( 0 == log_u->hed_u.las_d ) {
+    fprintf(stderr, "book: walk_init on empty log\r\n");
+    return c3n;
+  }
+
+  if ( nex_d < log_u->hed_u.fir_d || nex_d > log_u->hed_u.las_d ) {
+    fprintf(stderr, "book: walk_init start %llu out of range [%llu, %llu]\r\n",
+                    nex_d, log_u->hed_u.fir_d, log_u->hed_u.las_d);
+    return c3n;
+  }
+
+  if ( las_d < nex_d || las_d > log_u->hed_u.las_d ) {
+    fprintf(stderr, "book: walk_init end %llu out of range [%llu, %llu]\r\n",
+                    las_d, nex_d, log_u->hed_u.las_d);
+    return c3n;
+  }
+
+  // scan to starting event
+  off_w = sizeof(u3_book_head);
+  cur_d = log_u->hed_u.fir_d;
+
+  while ( cur_d < nex_d ) {
+    c3_d  skip_len;
+    c3_zs ret_zs;
+
+    ret_zs = pread(log_u->fid_i, &skip_len, sizeof(c3_d), off_w);
+    if ( ret_zs != (c3_zs)sizeof(c3_d) ) {
+      fprintf(stderr, "book: walk_init failed to scan to event %llu\r\n", nex_d);
+      return c3n;
+    }
+
+    // reject on-disk lengths that would wrap the 32-bit offset
+    if (  (skip_len < 4)
+       || (skip_len > 0xffffffffULL)
+       || ((c3_d)off_w + 20 + skip_len > 0xffffffffULL) )
+    {
+      fprintf(stderr, "book: walk_init corrupt record length scanning to event %llu\r\n",
+                      nex_d);
+      return c3n;
+    }
+
+    // skip entire record: len_d(8) + mug(4) + jam(len-4) + crc(4) + let_d(8)
+    off_w += (c3_w)(20 + skip_len);
+    cur_d++;
+  }
+
+  // initialize iterator
+  itr_u->fid_i = log_u->fid_i;
+  itr_u->nex_d = nex_d;
+  itr_u->las_d = las_d;
+  itr_u->off_w = off_w;
+  itr_u->liv_o = c3y;
+
+  return c3y;
+}
+
+/* u3_book_walk_next(): read next event from iterator.
+**
+**   allocates buffer for event (caller must free).
+**   returns c3n when no more events or error; after any c3n the
+**   iterator is marked dead and [len_z]/[buf_v] are not written.
+*/
+c3_o
+u3_book_walk_next(u3_book_walk* itr_u, c3_z* len_z, void** buf_v)
+{
+  c3_d  len_rec;
+  c3_y* mug_y;
+  c3_y* jam_y;
+  c3_w  crc_w, calc_crc;
+  c3_d  let_d;
+  c3_y* buf_y;
+
+  if ( !itr_u || !len_z || !buf_v || c3n == itr_u->liv_o ) {
+    return c3n;
+  }
+
+  // check if we've reached the end
+  if ( itr_u->nex_d > itr_u->las_d ) {
+    itr_u->liv_o = c3n;
+    return c3n;
+  }
+
+  // read record
+  if ( c3n == _book_read_record(itr_u->fid_i, &itr_u->off_w,
+                                &len_rec, &mug_y, &jam_y,
+                                &crc_w, &let_d) )
+  {
+    fprintf(stderr, "book: walk_next failed to read event %llu\r\n",
+                    itr_u->nex_d);
+    itr_u->liv_o = c3n;
+    return c3n;
+  }
+
+  // validate len_d == let_d
+  if ( len_rec != let_d ) {
+    fprintf(stderr, "book: walk_next length mismatch at event %llu\r\n",
+                    itr_u->nex_d);
+    c3_free(mug_y);
+    c3_free(jam_y);
+    itr_u->liv_o = c3n;
+    return c3n;
+  }
+
+  // payload must at least hold the mug, or len_rec - 4 underflows
+  if ( len_rec < 4 ) {
+    fprintf(stderr, "book: walk_next record too short at event %llu\r\n",
+                    itr_u->nex_d);
+    c3_free(mug_y);
+    c3_free(jam_y);
+    itr_u->liv_o = c3n;
+    return c3n;
+  }
+
+  // validate CRC over len_d (8) + mug (4) + jam, using the same
+  // helper as the writer in u3_book_save()
+  {
+    c3_y tmp_buf[12];
+    memcpy(tmp_buf, &len_rec, 8);
+    memcpy(tmp_buf + 8, mug_y, 4);
+    calc_crc = _book_crc32_two(tmp_buf, 12, jam_y, len_rec - 4);
+  }
+
+  if ( crc_w != calc_crc ) {
+    fprintf(stderr, "book: walk_next CRC mismatch at event %llu\r\n",
+                    itr_u->nex_d);
+    c3_free(mug_y);
+    c3_free(jam_y);
+    itr_u->liv_o = c3n;
+    return c3n;
+  }
+
+  // reconstruct buffer in mug + jam format
+  *len_z = len_rec;
+  buf_y = c3_malloc(*len_z);
+  memcpy(buf_y, mug_y, 4);
+  memcpy(buf_y + 4, jam_y, len_rec - 4);
+
+  c3_free(mug_y);
+  c3_free(jam_y);
+
+  *buf_v = buf_y;
+
+  // advance to next event
+  itr_u->nex_d++;
+
+  return c3y;
+}
+
+/* u3_book_walk_done(): close iterator.
+**
+**   marks the iterator dead and drops its copy of the file
+**   descriptor. the descriptor itself is not closed here.
+*/
+void
+u3_book_walk_done(u3_book_walk* itr_u)
+{
+  if ( !itr_u ) {
+    return;
+  }
+
+  // mark iterator as invalid
+  itr_u->liv_o = c3n;
+  itr_u->fid_i = -1;
+}
+
+/* _book_meta_step(): parse the metadata entry at [*off_w].
+**
+**   entry layout: [4 bytes: key_len][key][4 bytes: val_len][val]
+**
+**   on success, points [key_y]/[val_y] into [met_y], stores the
+**   lengths, advances [*off_w] past the entry and returns 0.
+**   if the entry does not fit inside the [len_w]-byte section,
+**   returns the name of the field that ran out of room; [*off_w]
+**   is then unchanged and the other outputs are unspecified.
+**
+**   every bound is tested as "needed > remaining", so a huge
+**   on-disk length cannot wrap the 32-bit offset past the check.
+*/
+static const c3_c*
+_book_meta_step(c3_y*  met_y,
+                c3_w   len_w,
+                c3_w*  off_w,
+                c3_y** key_y,
+                c3_w*  kel_w,
+                c3_y** val_y,
+                c3_w*  vel_w)
+{
+  c3_w pos_w = *off_w;
+
+  // read key length
+  if ( (pos_w > len_w) || (len_w - pos_w < 4) ) {
+    return "key len";
+  }
+  memcpy(kel_w, met_y + pos_w, 4);
+  pos_w += 4;
+
+  // read key
+  if ( len_w - pos_w < *kel_w ) {
+    return "key";
+  }
+  *key_y = met_y + pos_w;
+  pos_w += *kel_w;
+
+  // read value length
+  if ( len_w - pos_w < 4 ) {
+    return "val len";
+  }
+  memcpy(vel_w, met_y + pos_w, 4);
+  pos_w += 4;
+
+  // read value
+  if ( len_w - pos_w < *vel_w ) {
+    return "val";
+  }
+  *val_y = met_y + pos_w;
+  pos_w += *vel_w;
+
+  *off_w = pos_w;
+  return 0;
+}
+
+/* u3_book_read_meta(): read metadata by string key from log.
+**
+**   invokes callback with (ptr_v, len, data) or (ptr_v, -1, 0) if not found.
+**   the callback is invoked exactly once. [data] points into a
+**   temporary buffer that is freed right after the callback returns.
+**   a missing log, missing key, unreadable or corrupt section are all
+**   reported as "not found".
+*/
+void
+u3_book_read_meta(u3_book*    log_u,
+                  void*       ptr_v,
+                  const c3_c* key_c,
+                  void     (*read_f)(void*, c3_zs, void*))
+{
+  c3_z  key_z;
+  c3_y* meta_buf;
+  c3_w  meta_len;
+  c3_zs ret_zs;
+  c3_w  offset;
+  c3_w  count;
+  c3_w  i;
+
+  if ( !log_u || !key_c ) {
+    read_f(ptr_v, -1, 0);
+    return;
+  }
+
+  // check if metadata section exists
+  if ( 0 == log_u->hed_u.len_w ) {
+    read_f(ptr_v, -1, 0);
+    return;
+  }
+
+  // read entire metadata section
+  meta_len = log_u->hed_u.len_w;
+  meta_buf = c3_malloc(meta_len);
+
+  ret_zs = pread(log_u->fid_i, meta_buf, meta_len, log_u->hed_u.off_w);
+  if ( ret_zs != (c3_zs)meta_len ) {
+    fprintf(stderr, "book: read_meta: failed to read metadata section\r\n");
+    c3_free(meta_buf);
+    read_f(ptr_v, -1, 0);
+    return;
+  }
+
+  // parse metadata section
+  // format: [4 bytes: count] + entries
+  // entry: [4 bytes: key_len][key][4 bytes: val_len][val]
+
+  if ( meta_len < 4 ) {
+    fprintf(stderr, "book: read_meta: metadata section too small\r\n");
+    c3_free(meta_buf);
+    read_f(ptr_v, -1, 0);
+    return;
+  }
+
+  memcpy(&count, meta_buf, 4);
+  offset = 4;
+
+  key_z = strlen(key_c);
+
+  // linear search for key
+  for ( i = 0; i < count; i++ ) {
+    c3_y*       entry_key;
+    c3_w        entry_key_len;
+    c3_y*       entry_val;
+    c3_w        entry_val_len;
+    const c3_c* bad_c;
+
+    bad_c = _book_meta_step(meta_buf, meta_len, &offset,
+                            &entry_key, &entry_key_len,
+                            &entry_val, &entry_val_len);
+
+    if ( bad_c ) {
+      fprintf(stderr, "book: read_meta: corrupt metadata (%s)\r\n", bad_c);
+      c3_free(meta_buf);
+      read_f(ptr_v, -1, 0);
+      return;
+    }
+
+    // check if this is the key we're looking for
+    if (  (entry_key_len == key_z)
+       && (0 == memcmp(entry_key, key_c, key_z)) )
+    {
+      // found it - invoke callback
+      read_f(ptr_v, entry_val_len, entry_val);
+      c3_free(meta_buf);
+      return;
+    }
+  }
+
+  // not found
+  c3_free(meta_buf);
+  read_f(ptr_v, -1, 0);
+}
+
+/* u3_book_save_meta(): save metadata by string key into log.
+**
+**   updates or inserts key-value pair in metadata section.
+**
+**   the whole section is rebuilt: existing entries are copied in
+**   order (minus any entry for [key_c]), the new entry is appended,
+**   and the result is written at the current append offset. if the
+**   old section is truncated or corrupt, the entries that parsed
+**   cleanly are kept and the rest are dropped with a warning.
+**
+**   NOTE(review): the section is written at the append offset and the
+**   append offset is then moved past it, so later events land after
+**   the metadata. u3_book_read()/u3_book_walk_init() scan from the
+**   header assuming back-to-back event records, so metadata saved
+**   before or between events looks like it would be misparsed as a
+**   record -- confirm, and consider a fixed metadata area.
+*/
+c3_o
+u3_book_save_meta(u3_book*    log_u,
+                  const c3_c* key_c,
+                  c3_z        val_z,
+                  void*       val_p)
+{
+  c3_z  key_z;
+  c3_y* old_meta  = 0;
+  c3_w  old_len   = 0;
+  c3_w  old_count = 0;
+  c3_y* new_meta;
+  c3_d  max_d;          // upper bound on the rebuilt section's size
+  c3_w  new_len;
+  c3_w  new_count = 0;  // entries actually written, not assumed
+  c3_w  new_off;
+  c3_w  offset;
+  c3_zs ret_zs;
+
+  if ( !log_u || !key_c || (!val_p && val_z) ) {
+    return c3n;
+  }
+
+  key_z = strlen(key_c);
+
+  // key and value lengths are stored in 4-byte fields
+  if (  ((c3_d)key_z > 0xffffffffULL)
+     || ((c3_d)val_z > 0xffffffffULL) )
+  {
+    fprintf(stderr, "book: save_meta: key or value too large\r\n");
+    return c3n;
+  }
+
+  // read existing metadata if present
+  if ( 0 != log_u->hed_u.len_w ) {
+    old_len  = log_u->hed_u.len_w;
+    old_meta = c3_malloc(old_len);
+
+    ret_zs = pread(log_u->fid_i, old_meta, old_len, log_u->hed_u.off_w);
+    if ( ret_zs != (c3_zs)old_len ) {
+      fprintf(stderr, "book: save_meta: failed to read old metadata\r\n");
+      c3_free(old_meta);
+      return c3n;
+    }
+
+    if ( old_len < 4 ) {
+      fprintf(stderr, "book: save_meta: corrupt old metadata\r\n");
+      c3_free(old_meta);
+      return c3n;
+    }
+
+    memcpy(&old_count, old_meta, 4);
+  }
+
+  // worst case: count field + every old entry + the new entry.
+  // old entries live inside old_meta[4, old_len), so old_len already
+  // covers the count field plus all of them.
+  max_d = (c3_d)(old_meta ? old_len : 4) + 4 + key_z + 4 + val_z;
+
+  if ( max_d > 0xffffffffULL ) {
+    fprintf(stderr, "book: save_meta: metadata section too large\r\n");
+    if ( old_meta ) c3_free(old_meta);
+    return c3n;
+  }
+
+  // allocate new metadata buffer; the count is filled in last
+  new_meta = c3_malloc((c3_z)max_d);
+  offset   = 4;
+
+  // copy existing entries (except updated one), in a single pass so
+  // the size, the count and the bytes copied can never disagree
+  if ( old_meta ) {
+    c3_w old_offset = 4;
+    c3_w i;
+
+    for ( i = 0; i < old_count; i++ ) {
+      c3_y*       entry_key;
+      c3_w        entry_key_len;
+      c3_y*       entry_val;
+      c3_w        entry_val_len;
+      c3_w        beg_w = old_offset;
+      const c3_c* bad_c;
+
+      bad_c = _book_meta_step(old_meta, old_len, &old_offset,
+                              &entry_key, &entry_key_len,
+                              &entry_val, &entry_val_len);
+
+      if ( bad_c ) {
+        fprintf(stderr, "book: save_meta: corrupt old metadata (%s), "
+                        "stopping after %u of %u entries\r\n",
+                        bad_c, i, old_count);
+        break;
+      }
+
+      // skip if this is the key we're updating
+      if (  (entry_key_len == key_z)
+         && (0 == memcmp(entry_key, key_c, key_z)) )
+      {
+        continue;
+      }
+
+      // copy this entry verbatim: [key_len][key][val_len][val]
+      memcpy(new_meta + offset, old_meta + beg_w, old_offset - beg_w);
+      offset += old_offset - beg_w;
+      new_count++;
+    }
+
+    c3_free(old_meta);
+  }
+
+  // add new/updated entry
+  {
+    c3_w key_len_w = (c3_w)key_z;  // range-checked above
+    c3_w val_len_w = (c3_w)val_z;
+
+    memcpy(new_meta + offset, &key_len_w, 4);
+    offset += 4;
+    memcpy(new_meta + offset, key_c, key_z);
+    offset += key_len_w;
+    memcpy(new_meta + offset, &val_len_w, 4);
+    offset += 4;
+    if ( val_len_w ) {
+      memcpy(new_meta + offset, val_p, val_z);
+    }
+    offset += val_len_w;
+  }
+  new_count++;
+
+  // write count
+  memcpy(new_meta, &new_count, 4);
+  new_len = offset;
+
+  // write new metadata section at end of file
+  new_off = log_u->off_w;
+
+  if ( (c3_d)new_off + new_len > 0xffffffffULL ) {
+    fprintf(stderr, "book: save_meta: metadata does not fit in 32-bit log offset\r\n");
+    c3_free(new_meta);
+    return c3n;
+  }
+
+  ret_zs = pwrite(log_u->fid_i, new_meta, new_len, new_off);
+  if ( ret_zs != (c3_zs)new_len ) {
+    fprintf(stderr, "book: save_meta: failed to write metadata: %s\r\n",
+                    (ret_zs < 0) ? strerror(errno) : "short write");
+    c3_free(new_meta);
+    return c3n;
+  }
+
+  c3_free(new_meta);
+
+  // sync metadata
+  if ( -1 == c3_sync(log_u->fid_i) ) {
+    fprintf(stderr, "book: save_meta: failed to sync metadata: %s\r\n",
+                    strerror(errno));
+    return c3n;
+  }
+
+  // update header
+  log_u->hed_u.off_w = new_off;
+  log_u->hed_u.len_w = new_len;
+  log_u->dit_o = c3y;
+
+  // write and sync header
+  if ( c3n == _book_write_header(log_u) ) {
+    return c3n;
+  }
+
+  // update append offset (metadata is now at end)
+  log_u->off_w = new_off + new_len;
+
+  return c3y;
+}
diff --git a/pkg/vere/db/book.h b/pkg/vere/db/book.h
new file mode 100644
index 0000000000..144da0adf1
--- /dev/null
+++ b/pkg/vere/db/book.h
@@ -0,0 +1,114 @@
+/// @file
+
+#ifndef U3_VERE_DB_BOOK_H
+#define U3_VERE_DB_BOOK_H
+
+#include "c3/c3.h"
+
+  /* book: append-only event log
+  */
+    /* u3_book_head: on-disk file header (64 bytes)
+    */
+      typedef struct _u3_book_head {
+        c3_w mag_w;      // magic number: 0x424f4f4b ("BOOK")
+        c3_w ver_w;      // format version: 1
+        c3_d fir_d;      // first event number in file
+        c3_d las_d;      // last event number in file
+        c3_w off_w;      // offset to metadata section
+        c3_w len_w;      // length of metadata section
+        c3_y pad_y[32];  // reserved for future use, zeroed
+      } u3_book_head;
+
+    /* u3_book: event log handle
+    */
+      typedef struct _u3_book {
+        c3_i         fid_i;  // file descriptor
+        c3_c*        pax_c;  // file path
+        u3_book_head hed_u;  // cached header
+        c3_w         off_w;  // append offset (end of last event)
+        c3_o         dit_o;  // header needs sync
+      } u3_book;
+
+    /* u3_book_walk: event iterator
+    */
+      typedef struct _u3_book_walk {
+        c3_i fid_i;  // file descriptor
+        c3_d nex_d;  // next event number to read
+        c3_d las_d;  // last event number, inclusive
+        c3_w off_w;  // current file offset
+        c3_o liv_o;  // iterator valid
+      } u3_book_walk;
+
+    /* u3_book_init(): open/create event log at [pax_c].
+    */
+      u3_book*
+      u3_book_init(const c3_c* pax_c);
+
+    /* u3_book_exit(): close event log.
+    */
+      void
+      u3_book_exit(u3_book* log_u);
+
+    /* u3_book_stat(): print book stats.
+    */
+      void
+      u3_book_stat(const c3_c* pax_c);
+
+    /* u3_book_gulf(): read first and last event numbers.
+    */
+      c3_o
+      u3_book_gulf(u3_book* log_u, c3_d* low_d, c3_d* hig_d);
+
+    /* u3_book_read(): read [len_d] events starting at [eve_d].
+    **
+    **   calls [read_f] once per event, in order, with
+    **   (ptr_v, event number, buffer size, mug + jam buffer).
+    **   the buffer is freed after the callback returns.
+    **   returns c3n if the range is outside the log, a record fails
+    **   validation, or the callback returns c3n.
+    */
+      c3_o
+      u3_book_read(u3_book* log_u,
+                   void*    ptr_v,
+                   c3_d     eve_d,
+                   c3_d     len_d,
+                   c3_o  (*read_f)(void*, c3_d, c3_z, void*));
+
+    /* u3_book_save(): save [len_d] events starting at [eve_d].
+    **
+    **   [byt_p] and [siz_i] are arrays of mug + jam buffers and their
+    **   sizes. [eve_d] must be 1 on an empty log, otherwise one past
+    **   the last saved event.
+    */
+      c3_o
+      u3_book_save(u3_book* log_u,
+                   c3_d     eve_d,
+                   c3_d     len_d,
+                   void**   byt_p,
+                   c3_z*    siz_i);
+
+    /* u3_book_read_meta(): read metadata by string key from log.
+    **
+    **   invokes [read_f] with (ptr_v, len, data), or (ptr_v, -1, 0)
+    **   if the key is not found.
+    */
+      void
+      u3_book_read_meta(u3_book*    log_u,
+                        void*       ptr_v,
+                        const c3_c* key_c,
+                        void     (*read_f)(void*, c3_zs, void*));
+
+    /* u3_book_save_meta(): save metadata by string key into log.
+    **
+    **   updates or inserts the key-value pair; [val_p] holds [val_z]
+    **   bytes.
+    */
+      c3_o
+      u3_book_save_meta(u3_book*    log_u,
+                        const c3_c* key_c,
+                        c3_z        val_z,
+                        void*       val_p);
+
+    /* u3_book_walk_init(): initialize event iterator.
+    **
+    **   the iterator yields events [nex_d] through [las_d], inclusive;
+    **   both must lie within the log.
+    */
+      c3_o
+      u3_book_walk_init(u3_book*      log_u,
+                        u3_book_walk* itr_u,
+                        c3_d          nex_d,
+                        c3_d          las_d);
+
+    /* u3_book_walk_next(): read next event from iterator.
+    **
+    **   allocates the event buffer (caller must free). returns c3n
+    **   when there are no more events or on error.
+    */
+      c3_o
+      u3_book_walk_next(u3_book_walk* itr_u, c3_z* len_z, void** buf_v);
+
+    /* u3_book_walk_done(): close iterator.
+ */ + void + u3_book_walk_done(u3_book_walk* itr_u); + +#endif /* ifndef U3_VERE_DB_BOOK_H */ diff --git a/pkg/vere/vere.h b/pkg/vere/vere.h index 4044fdbade..cf4eb7c985 100644 --- a/pkg/vere/vere.h +++ b/pkg/vere/vere.h @@ -10,6 +10,7 @@ #include "c3/c3.h" #include "db/lmdb.h" +#include "db/book.h" #include "noun.h" #include "uv.h" #include From 8ae7a7e853f084fab3c3708f301ecd6c0a182bd8 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Wed, 31 Dec 2025 18:14:37 -0500 Subject: [PATCH 02/38] disk: replaces lmdb with book --- pkg/vere/book_tests.c | 16 +++---- pkg/vere/db/book.c | 99 ++++++++++++++++++++++++++++--------------- pkg/vere/db/book.h | 3 +- pkg/vere/disk.c | 80 +++++++++++++++++----------------- pkg/vere/main.c | 2 +- pkg/vere/vere.h | 6 +-- 6 files changed, 121 insertions(+), 85 deletions(-) diff --git a/pkg/vere/book_tests.c b/pkg/vere/book_tests.c index 2e5a0d1048..af31c0b275 100644 --- a/pkg/vere/book_tests.c +++ b/pkg/vere/book_tests.c @@ -205,7 +205,7 @@ _test_book_single_event(void) // create and save event 1 _test_make_event(&buf_y, &siz_z, 1); - if ( c3n == u3_book_save(log_u, 1, 1, (void**)&buf_y, &siz_z) ) { + if ( c3n == u3_book_save(log_u, 1, 1, (void**)&buf_y, &siz_z, 0) ) { fprintf(stderr, "book_tests: save failed\r\n"); c3_free(buf_y); u3_book_exit(log_u); @@ -283,7 +283,7 @@ _test_book_batch_write(void) } // write batch - if ( c3n == u3_book_save(log_u, 1, 100, bufs, sizes) ) { + if ( c3n == u3_book_save(log_u, 1, 100, bufs, sizes, 0) ) { fprintf(stderr, "book_tests: batch save failed\r\n"); for ( i = 0; i < 100; i++ ) { c3_free(bufs[i]); @@ -365,7 +365,7 @@ _test_book_persistence(void) _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); } - if ( c3n == u3_book_save(log_u, 1, 50, bufs, sizes) ) { + if ( c3n == u3_book_save(log_u, 1, 50, bufs, sizes, 0) ) { fprintf(stderr, "book_tests: persist save failed\r\n"); for ( i = 0; i < 50; i++ ) { c3_free(bufs[i]); @@ -450,7 +450,7 @@ _test_book_contiguity(void) // write event 1 
_test_make_event(&buf_y, &siz_z, 1); - if ( c3n == u3_book_save(log_u, 1, 1, (void**)&buf_y, &siz_z) ) { + if ( c3n == u3_book_save(log_u, 1, 1, (void**)&buf_y, &siz_z, 0) ) { fprintf(stderr, "book_tests: contig save 1 failed\r\n"); c3_free(buf_y); u3_book_exit(log_u); @@ -462,7 +462,7 @@ _test_book_contiguity(void) // try to write event 3 (should fail - gap) _test_make_event(&buf_y, &siz_z, 3); - if ( c3y == u3_book_save(log_u, 3, 1, (void**)&buf_y, &siz_z) ) { + if ( c3y == u3_book_save(log_u, 3, 1, (void**)&buf_y, &siz_z, 0) ) { fprintf(stderr, "book_tests: contig should have failed for gap\r\n"); c3_free(buf_y); u3_book_exit(log_u); @@ -474,7 +474,7 @@ _test_book_contiguity(void) // write event 2 (should succeed) _test_make_event(&buf_y, &siz_z, 2); - if ( c3n == u3_book_save(log_u, 2, 1, (void**)&buf_y, &siz_z) ) { + if ( c3n == u3_book_save(log_u, 2, 1, (void**)&buf_y, &siz_z, 0) ) { fprintf(stderr, "book_tests: contig save 2 failed\r\n"); c3_free(buf_y); u3_book_exit(log_u); @@ -518,7 +518,7 @@ _test_book_partial_read(void) _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); } - if ( c3n == u3_book_save(log_u, 1, 100, bufs, sizes) ) { + if ( c3n == u3_book_save(log_u, 1, 100, bufs, sizes, 0) ) { fprintf(stderr, "book_tests: partial save failed\r\n"); for ( i = 0; i < 100; i++ ) { c3_free(bufs[i]); @@ -591,7 +591,7 @@ _test_book_iterator(void) _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); } - if ( c3n == u3_book_save(log_u, 1, 50, bufs, sizes) ) { + if ( c3n == u3_book_save(log_u, 1, 50, bufs, sizes, 0) ) { fprintf(stderr, "book_tests: iter save failed\r\n"); for ( i = 0; i < 50; i++ ) { c3_free(bufs[i]); diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index 4b10a21eb9..b3f376ea75 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -15,19 +15,20 @@ // book: append-only event log // -// simple persistence layer replacing LMDB for event storage. +// simple file-based persistence layer for urbit's event log. 
// optimized for sequential writes and reads, no random access. // // file format: // [64-byte header] -// [events: len_d | mug_l | jam_data | crc_m | let_d] // [metadata section] +// [events: len_d | mug_l | jam_data | crc_m | let_d] // /* constants */ - #define BOOK_MAGIC 0x424f4f4b // "BOOK" - #define BOOK_VERSION 1 // format version + #define BOOK_MAGIC 0x424f4f4b // "BOOK" + #define BOOK_VERSION 1 // format version + #define BOOK_META_SIZE 256 // reserved metadata area size /* _book_crc32(): compute CRC32 checksum. */ @@ -203,7 +204,7 @@ _book_read_record(c3_i fid_i, static c3_w _book_scan_end(u3_book* log_u) { - c3_w off_w = sizeof(u3_book_head); + c3_w off_w = sizeof(u3_book_head) + BOOK_META_SIZE; // events start here c3_d count_d = 0; c3_d expected_d; @@ -297,8 +298,8 @@ u3_book_init(const c3_c* pax_c) struct stat buf_u; u3_book* log_u; - // construct path to event.log - snprintf(path_c, sizeof(path_c), "%s/event.log", pax_c); + // construct path to book.log + snprintf(path_c, sizeof(path_c), "%s/book.log", pax_c); // open or create file fid_i = c3_open(path_c, O_RDWR | O_CREAT, 0644); @@ -325,7 +326,8 @@ u3_book_init(const c3_c* pax_c) // new file: initialize header _book_init_header(log_u); _book_write_header(log_u); - log_u->off_w = sizeof(u3_book_head); + // events start after header + reserved metadata area + log_u->off_w = sizeof(u3_book_head) + BOOK_META_SIZE; } else if ( buf_u.st_size < (off_t)sizeof(u3_book_head) ) { // corrupt file: too small @@ -394,33 +396,55 @@ u3_book_gulf(u3_book* log_u, c3_d* low_d, c3_d* hig_d) void u3_book_stat(const c3_c* pax_c) { - u3_book* log_u = u3_book_init(pax_c); + c3_i fid_i; + u3_book_head hed_u; struct stat buf_u; - if ( !log_u ) { - fprintf(stderr, "book: failed to open for stats\r\n"); + // open the file directly + fid_i = c3_open(pax_c, O_RDONLY, 0); + if ( fid_i < 0 ) { + fprintf(stderr, "book: failed to open %s: %s\r\n", pax_c, strerror(errno)); + return; + } + + // read and validate header + if ( 
sizeof(u3_book_head) != read(fid_i, &hed_u, sizeof(u3_book_head)) ) { + fprintf(stderr, "book: failed to read header\r\n"); + close(fid_i); return; } - if ( fstat(log_u->fid_i, &buf_u) < 0 ) { + if ( BOOK_MAGIC != hed_u.mag_w ) { + fprintf(stderr, "book: invalid magic number: 0x%x\r\n", hed_u.mag_w); + close(fid_i); + return; + } + + if ( BOOK_VERSION != hed_u.ver_w ) { + fprintf(stderr, "book: unsupported version: %u\r\n", hed_u.ver_w); + close(fid_i); + return; + } + + if ( fstat(fid_i, &buf_u) < 0 ) { fprintf(stderr, "book: fstat failed\r\n"); - u3_book_exit(log_u); + close(fid_i); return; } fprintf(stderr, "book info:\r\n"); - fprintf(stderr, " file: %s\r\n", log_u->pax_c); - fprintf(stderr, " version: %u\r\n", log_u->hed_u.ver_w); - fprintf(stderr, " first event: %llu\r\n", log_u->hed_u.fir_d); - fprintf(stderr, " last event: %llu\r\n", log_u->hed_u.las_d); + fprintf(stderr, " file: %s\r\n", pax_c); + fprintf(stderr, " version: %u\r\n", hed_u.ver_w); + fprintf(stderr, " first event: %llu\r\n", hed_u.fir_d); + fprintf(stderr, " last event: %llu\r\n", hed_u.las_d); fprintf(stderr, " event count: %llu\r\n", - (0 == log_u->hed_u.las_d ) ? 0 : - (log_u->hed_u.las_d - log_u->hed_u.fir_d + 1)); + (0 == hed_u.las_d ) ? 0 : + (hed_u.las_d - hed_u.fir_d + 1)); fprintf(stderr, " file size: %lld bytes\r\n", (long long)buf_u.st_size); - fprintf(stderr, " metadata offset: %u\r\n", log_u->hed_u.off_w); - fprintf(stderr, " metadata length: %u\r\n", log_u->hed_u.len_w); + fprintf(stderr, " metadata offset: %u\r\n", hed_u.off_w); + fprintf(stderr, " metadata length: %u\r\n", hed_u.len_w); - u3_book_exit(log_u); + close(fid_i); } /* u3_book_save(): save [len_d] events starting at [eve_d]. 
@@ -433,7 +457,8 @@ u3_book_save(u3_book* log_u, c3_d eve_d, c3_d len_d, void** byt_p, - c3_z* siz_i) + c3_z* siz_i, + c3_d epo_d) { c3_w i; c3_w off_now; @@ -444,8 +469,8 @@ u3_book_save(u3_book* log_u, // validate contiguity if ( 0 == log_u->hed_u.las_d ) { - // empty log: first event must be 1 - if ( 1 != eve_d ) { + // empty log: first event must be the first event in the epoch + if ( epo_d + 1 != eve_d ) { fprintf(stderr, "book: first event must be 1, got %llu\r\n", eve_d); return c3n; } @@ -606,8 +631,8 @@ u3_book_read(u3_book* log_u, return c3n; } - // scan to starting event - off_w = sizeof(u3_book_head); + // scan to starting event (events start after header + metadata area) + off_w = sizeof(u3_book_head) + BOOK_META_SIZE; cur_d = log_u->hed_u.fir_d; while ( cur_d < eve_d ) { @@ -724,8 +749,8 @@ u3_book_walk_init(u3_book* log_u, return c3n; } - // scan to starting event - off_w = sizeof(u3_book_head); + // scan to starting event (events start after header + metadata area) + off_w = sizeof(u3_book_head) + BOOK_META_SIZE; cur_d = log_u->hed_u.fir_d; while ( cur_d < nex_d ) { @@ -1124,8 +1149,17 @@ u3_book_save_meta(u3_book* log_u, memcpy(new_meta + offset, val_p, val_z); offset += val_z; - // write new metadata section at end of file - c3_w new_off = log_u->off_w; + // write new metadata section in reserved area after header + c3_w new_off = sizeof(u3_book_head); + + // ensure metadata fits in reserved space + if ( new_len > BOOK_META_SIZE ) { + fprintf(stderr, "book: save_meta: metadata too large (%u > %u)\r\n", + new_len, BOOK_META_SIZE); + c3_free(new_meta); + if ( old_meta ) c3_free(old_meta); + return c3n; + } ret_zs = pwrite(log_u->fid_i, new_meta, new_len, new_off); if ( ret_zs != (c3_zs)new_len ) { @@ -1156,8 +1190,7 @@ u3_book_save_meta(u3_book* log_u, return c3n; } - // update append offset (metadata is now at end) - log_u->off_w = new_off + new_len; + // off_w is not affected by metadata writes - events append at off_w return c3y; } diff --git 
a/pkg/vere/db/book.h b/pkg/vere/db/book.h index 144da0adf1..b08ffe9e57 100644 --- a/pkg/vere/db/book.h +++ b/pkg/vere/db/book.h @@ -75,7 +75,8 @@ c3_d eve_d, c3_d len_d, void** byt_p, - c3_z* siz_i); + c3_z* siz_i, + c3_d epo_d); /* u3_book_read_meta(): read metadata by string key from log. */ diff --git a/pkg/vere/disk.c b/pkg/vere/disk.c index 19a03b5372..ec6c0dc0d8 100644 --- a/pkg/vere/disk.c +++ b/pkg/vere/disk.c @@ -4,14 +4,14 @@ #include "events.h" #include "vere.h" #include "version.h" -#include "db/lmdb.h" +#include "db/book.h" #include #include "migrate.h" #include "v4.h" struct _u3_disk_walk { - u3_lmdb_walk itr_u; + u3_book_walk itr_u; u3_disk* log_u; c3_o liv_o; }; @@ -92,11 +92,12 @@ _disk_commit_cb(uv_work_t* ted_u) { u3_disk* log_u = ted_u->data; - log_u->sav_u.ret_o = u3_lmdb_save(log_u->mdb_u, + log_u->sav_u.ret_o = u3_book_save(log_u->mdb_u, log_u->sav_u.eve_d, log_u->sav_u.len_w, (void**)log_u->sav_u.byt_y, - log_u->sav_u.siz_i); + log_u->sav_u.siz_i, + log_u->epo_d); } /* _disk_commit_start(): queue async event-batch write. 
@@ -273,11 +274,12 @@ u3_disk_sync(u3_disk* log_u) // XX max 100 // if ( c3y == _disk_batch(log_u) ) { - ret_o = u3_lmdb_save(log_u->mdb_u, + ret_o = u3_book_save(log_u->mdb_u, log_u->sav_u.eve_d, log_u->sav_u.len_w, (void**)log_u->sav_u.byt_y, - log_u->sav_u.siz_i); + log_u->sav_u.siz_i, + log_u->epo_d); log_u->sav_u.ret_o = ret_o; @@ -373,7 +375,7 @@ u3_disk_read_list(u3_disk* log_u, c3_d eve_d, c3_d len_d, c3_l* mug_l) { struct _cd_list ven_u = { log_u, u3_nul, 0 }; - if ( c3n == u3_lmdb_read(log_u->mdb_u, &ven_u, + if ( c3n == u3_book_read(log_u->mdb_u, &ven_u, eve_d, len_d, _disk_read_list_cb) ) { // XX test normal (not subcommand) replay with and without, @@ -397,7 +399,7 @@ u3_disk_walk_init(u3_disk* log_u, c3_d max_d = eve_d + len_d - 1; wok_u->log_u = log_u; - wok_u->liv_o = u3_lmdb_walk_init(log_u->mdb_u, + wok_u->liv_o = u3_book_walk_init(log_u->mdb_u, &wok_u->itr_u, eve_d, c3_min(max_d, log_u->dun_d)); @@ -433,7 +435,7 @@ u3_disk_walk_step(u3_disk_walk* wok_u, u3_fact* tac_u) tac_u->eve_d = wok_u->itr_u.nex_d; - if ( c3n == u3_lmdb_walk_next(&wok_u->itr_u, &len_i, &buf_v) ) { + if ( c3n == u3_book_walk_next(&wok_u->itr_u, &len_i, &buf_v) ) { fprintf(stderr, "disk: (%" PRIu64 "): read fail\r\n", tac_u->eve_d); return wok_u->liv_o = c3n; } @@ -455,14 +457,14 @@ u3_disk_walk_step(u3_disk_walk* wok_u, u3_fact* tac_u) void u3_disk_walk_done(u3_disk_walk* wok_u) { - u3_lmdb_walk_done(&wok_u->itr_u); + u3_book_walk_done(&wok_u->itr_u); c3_free(wok_u); } /* _disk_save_meta(): serialize atom, save as metadata at [key_c]. */ static c3_o -_disk_save_meta(MDB_env* mdb_u, const c3_c* key_c, c3_w len_w, c3_y* byt_y) +_disk_save_meta(u3_book* mdb_u, const c3_c* key_c, c3_w len_w, c3_y* byt_y) { // strip trailing zeroes. 
// @@ -470,13 +472,13 @@ _disk_save_meta(MDB_env* mdb_u, const c3_c* key_c, c3_w len_w, c3_y* byt_y) len_w--; } - return u3_lmdb_save_meta(mdb_u, key_c, len_w, byt_y); + return u3_book_save_meta(mdb_u, key_c, len_w, byt_y); } /* u3_disk_save_meta(): save metadata. */ c3_o -u3_disk_save_meta(MDB_env* mdb_u, const u3_meta* met_u) +u3_disk_save_meta(u3_book* mdb_u, const u3_meta* met_u) { u3_assert( c3y == u3a_is_cat(met_u->lif_w) ); @@ -501,10 +503,10 @@ u3_disk_save_meta(MDB_env* mdb_u, const u3_meta* met_u) c3_o u3_disk_save_meta_meta(c3_c* log_c, const u3_meta* met_u) { - MDB_env* dbm_u; + u3_book* dbm_u; - if ( 0 == (dbm_u = u3_lmdb_init(log_c, u3_Host.ops_u.siz_i)) ) { - fprintf(stderr, "disk: failed to initialize meta-lmdb\r\n"); + if ( 0 == (dbm_u = u3_book_init(log_c)) ) { + fprintf(stderr, "disk: failed to initialize meta-book\r\n"); return c3n; } @@ -513,7 +515,7 @@ u3_disk_save_meta_meta(c3_c* log_c, const u3_meta* met_u) return c3n; } - u3_lmdb_exit(dbm_u); + u3_book_exit(dbm_u); return c3y; } @@ -544,7 +546,7 @@ _disk_meta_read_cb(void* ptr_v, ssize_t val_i, void* val_v) /* u3_disk_read_meta(): read metadata. 
*/ c3_o -u3_disk_read_meta(MDB_env* mdb_u, u3_meta* met_u) +u3_disk_read_meta(u3_book* mdb_u, u3_meta* met_u) { c3_w ver_w, lif_w; c3_d who_d[2]; @@ -554,13 +556,13 @@ u3_disk_read_meta(MDB_env* mdb_u, u3_meta* met_u) // version // - u3_lmdb_read_meta(mdb_u, &val_u, "version", _disk_meta_read_cb); + u3_book_read_meta(mdb_u, &val_u, "version", _disk_meta_read_cb); ver_w = val_u.buf_y[0]; // identity // - u3_lmdb_read_meta(mdb_u, &val_u, "who", _disk_meta_read_cb); + u3_book_read_meta(mdb_u, &val_u, "who", _disk_meta_read_cb); if ( 0 > val_u.hav_i ) { fprintf(stderr, "disk: read meta: no identity\r\n"); @@ -595,7 +597,7 @@ u3_disk_read_meta(MDB_env* mdb_u, u3_meta* met_u) // fake bit // - u3_lmdb_read_meta(mdb_u, &val_u, "fake", _disk_meta_read_cb); + u3_book_read_meta(mdb_u, &val_u, "fake", _disk_meta_read_cb); if ( 0 > val_u.hav_i ) { fprintf(stderr, "disk: read meta: no fake bit\r\n"); @@ -615,7 +617,7 @@ u3_disk_read_meta(MDB_env* mdb_u, u3_meta* met_u) // life // - u3_lmdb_read_meta(mdb_u, &val_u, "life", _disk_meta_read_cb); + u3_book_read_meta(mdb_u, &val_u, "life", _disk_meta_read_cb); if ( 0 > val_u.hav_i ) { fprintf(stderr, "disk: read meta: no lifecycle length\r\n"); @@ -829,7 +831,7 @@ u3_disk_exit(u3_disk* log_u) // close database // - u3_lmdb_exit(log_u->mdb_u); + u3_book_exit(log_u->mdb_u); // dispose planned writes // @@ -1153,11 +1155,11 @@ _disk_epoc_roll(u3_disk* log_u, c3_d epo_d) fprintf(stderr, "disk: failed to read metadata\r\n"); goto fail3; } - u3_lmdb_exit(log_u->mdb_u); + u3_book_exit(log_u->mdb_u); log_u->mdb_u = 0; // initialize db of new epoch - if ( 0 == (log_u->mdb_u = u3_lmdb_init(epo_c, u3_Host.ops_u.siz_i)) ) { + if ( 0 == (log_u->mdb_u = u3_book_init(epo_c)) ) { fprintf(stderr, "disk: failed to initialize database\r\n"); c3_free(log_u); goto fail3; @@ -1350,7 +1352,7 @@ _disk_migrate_epoc(u3_disk* log_u, c3_d eve_d) } // finish with old log - u3_lmdb_exit(log_u->mdb_u); + u3_book_exit(log_u->mdb_u); log_u->mdb_u = 0; // check if 
lock.mdb is readable in log directory @@ -1418,7 +1420,7 @@ _disk_migrate_epoc(u3_disk* log_u, c3_d eve_d) return c3n; } - if ( 0 == (log_u->mdb_u = u3_lmdb_init(tmp_c, u3_Host.ops_u.siz_i)) ) { + if ( 0 == (log_u->mdb_u = u3_book_init(tmp_c)) ) { fprintf(stderr, "disk: failed to initialize database at %s\r\n", tmp_c); return c3n; @@ -1432,7 +1434,7 @@ _disk_migrate_epoc(u3_disk* log_u, c3_d eve_d) // atomic truncation of old log // - u3_lmdb_exit(log_u->mdb_u); + u3_book_exit(log_u->mdb_u); log_u->mdb_u = 0; c3_c trd_c[8193]; @@ -1453,7 +1455,7 @@ _disk_migrate_epoc(u3_disk* log_u, c3_d eve_d) strerror(errno)); } - if ( 0 == (log_u->mdb_u = u3_lmdb_init(epo_c, u3_Host.ops_u.siz_i)) ) { + if ( 0 == (log_u->mdb_u = u3_book_init(epo_c)) ) { fprintf(stderr, "disk: failed to initialize database at %s\r\n", epo_c); return c3n; @@ -1529,7 +1531,7 @@ u3_disk_roll(u3_disk* log_u, c3_d eve_d) // XX get fir_d from log_u c3_d fir_d, las_d; - if ( c3n == u3_lmdb_gulf(log_u->mdb_u, &fir_d, &las_d) ) { + if ( c3n == u3_book_gulf(log_u->mdb_u, &fir_d, &las_d) ) { fprintf(stderr, "roll: failed to read first/last event numbers\r\n"); exit(1); } @@ -1679,7 +1681,7 @@ static void _disk_migrate_old(u3_disk* log_u) { c3_d fir_d, las_d; - if ( c3n == u3_lmdb_gulf(log_u->mdb_u, &fir_d, &las_d) ) { + if ( c3n == u3_book_gulf(log_u->mdb_u, &fir_d, &las_d) ) { fprintf(stderr, "disk: failed to get first/last event numbers\r\n"); exit(1); } @@ -1769,7 +1771,7 @@ _disk_epoc_load(u3_disk* log_u, c3_d lat_d, u3_disk_load_e lod_e) snprintf(epo_c, 8192, "%s/0i%" PRIc3_d, log_u->com_u->pax_c, lat_d); // initialize latest epoch's db - if ( 0 == (log_u->mdb_u = u3_lmdb_init(epo_c, u3_Host.ops_u.siz_i)) ) { + if ( 0 == (log_u->mdb_u = u3_book_init(epo_c)) ) { fprintf(stderr, "disk: failed to initialize database at %s\r\n", epo_c); return _epoc_fail; @@ -1777,11 +1779,11 @@ _disk_epoc_load(u3_disk* log_u, c3_d lat_d, u3_disk_load_e lod_e) fprintf(stderr, "disk: loaded epoch 0i%" PRIc3_d "\r\n", lat_d); 
- // get first/last event numbers from lmdb + // get first/last event numbers from book c3_d fir_d, las_d; - if ( c3n == u3_lmdb_gulf(log_u->mdb_u, &fir_d, &las_d) ) { + if ( c3n == u3_book_gulf(log_u->mdb_u, &fir_d, &las_d) ) { fprintf(stderr, "disk: failed to get first/last event numbers\r\n"); - u3_lmdb_exit(log_u->mdb_u); + u3_book_exit(log_u->mdb_u); log_u->mdb_u = 0; return _epoc_fail; } @@ -1791,7 +1793,7 @@ _disk_epoc_load(u3_disk* log_u, c3_d lat_d, u3_disk_load_e lod_e) && !las_d && (c3n == u3_disk_read_meta(log_u->mdb_u, 0)) ) { - u3_lmdb_exit(log_u->mdb_u); + u3_book_exit(log_u->mdb_u); log_u->mdb_u = 0; return _epoc_void; } @@ -2043,7 +2045,7 @@ u3_disk_load(c3_c* pax_c, u3_disk_load_e lod_e) // { u3_meta met_u; - if ( (0 == (log_u->mdb_u = u3_lmdb_init(log_c, u3_Host.ops_u.siz_i))) + if ( (0 == (log_u->mdb_u = u3_book_init(log_c))) || (c3n == u3_disk_read_meta(log_u->mdb_u, &met_u)) ) { fprintf(stderr, "disk: failed to read metadata\r\n"); @@ -2068,8 +2070,8 @@ u3_disk_load(c3_c* pax_c, u3_disk_load_e lod_e) return log_u; } - // close top-level lmdb - u3_lmdb_exit(log_u->mdb_u); + // close top-level book + u3_book_exit(log_u->mdb_u); log_u->mdb_u = 0; // get latest epoch number diff --git a/pkg/vere/main.c b/pkg/vere/main.c index a41fb7defc..e1b3512172 100644 --- a/pkg/vere/main.c +++ b/pkg/vere/main.c @@ -1506,7 +1506,7 @@ _cw_info(c3_i argc, c3_c* argv[]) fprintf(stderr, "\r\n"); } - u3_lmdb_stat(log_u->mdb_u, stdout); + u3_book_stat(log_u->mdb_u->pax_c); u3_disk_exit(log_u); u3m_stop(); diff --git a/pkg/vere/vere.h b/pkg/vere/vere.h index cf4eb7c985..a793e9aee4 100644 --- a/pkg/vere/vere.h +++ b/pkg/vere/vere.h @@ -533,7 +533,7 @@ c3_i lok_i; // lockfile c3_o liv_o; // live c3_w ver_w; // version (see version.h) - void* mdb_u; // lmdb env of current epoch + u3_book* mdb_u; // book env of current epoch c3_d sen_d; // commit requested c3_d dun_d; // committed c3_d epo_d; // current epoch number @@ -878,12 +878,12 @@ /* u3_disk_read_meta(): read 
metadata. */ c3_o - u3_disk_read_meta(MDB_env* mdb_u, u3_meta* met_u); + u3_disk_read_meta(u3_book* mdb_u, u3_meta* met_u); /* u3_disk_save_meta(): save metadata. */ c3_o - u3_disk_save_meta(MDB_env* mdb_u, const u3_meta* met_u); + u3_disk_save_meta(u3_book* mdb_u, const u3_meta* met_u); /* u3_disk_save_meta_meta(): save meta metadata. */ From 5e7f368a90b9b62d08acc69f307dffb2d799b75a Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Thu, 1 Jan 2026 17:29:08 -0500 Subject: [PATCH 03/38] book: formalizes in-memory and on-disk event structures as `reed` and `deed` --- pkg/vere/db/book.c | 730 +++++++++++++++++++++------------------------ pkg/vere/db/book.h | 48 ++- 2 files changed, 383 insertions(+), 395 deletions(-) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index b3f376ea75..11c582f46b 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -47,14 +47,14 @@ _book_crc32_two(c3_y* one_y, c3_w one_w, c3_y* two_y, c3_w two_w) return (c3_w)crc32(crc_w, two_y, two_w); } -/* _book_write_header(): write header to file at offset 0. +/* _book_save_head(): write header to file at offset 0. */ static c3_o -_book_write_header(u3_book* log_u) +_book_save_head(u3_book* txt_u) { c3_zs ret_zs; - ret_zs = pwrite(log_u->fid_i, &log_u->hed_u, + ret_zs = pwrite(txt_u->fid_i, &txt_u->hed_u, sizeof(u3_book_head), 0); if ( ret_zs != sizeof(u3_book_head) ) { @@ -63,24 +63,24 @@ _book_write_header(u3_book* log_u) return c3n; } - if ( -1 == c3_sync(log_u->fid_i) ) { + if ( -1 == c3_sync(txt_u->fid_i) ) { fprintf(stderr, "book: failed to sync header: %s\r\n", strerror(errno)); return c3n; } - log_u->dit_o = c3n; + txt_u->dit_o = c3n; return c3y; } -/* _book_read_header(): read and validate header. +/* _book_read_head(): read and validate header. 
*/ static c3_o -_book_read_header(u3_book* log_u) +_book_read_head(u3_book* txt_u) { c3_zs ret_zs; - ret_zs = pread(log_u->fid_i, &log_u->hed_u, + ret_zs = pread(txt_u->fid_i, &txt_u->hed_u, sizeof(u3_book_head), 0); if ( ret_zs != sizeof(u3_book_head) ) { @@ -88,109 +88,212 @@ _book_read_header(u3_book* log_u) return c3n; } - if ( BOOK_MAGIC != log_u->hed_u.mag_w ) { + if ( BOOK_MAGIC != txt_u->hed_u.mag_w ) { fprintf(stderr, "book: invalid magic: 0x%08x\r\n", - log_u->hed_u.mag_w); + txt_u->hed_u.mag_w); return c3n; } - if ( BOOK_VERSION != log_u->hed_u.ver_w ) { + if ( BOOK_VERSION != txt_u->hed_u.ver_w ) { fprintf(stderr, "book: unsupported version: %u\r\n", - log_u->hed_u.ver_w); + txt_u->hed_u.ver_w); return c3n; } return c3y; } -/* _book_init_header(): initialize header for new file. +/* _book_init_head(): initialize header for new file. */ static void -_book_init_header(u3_book* log_u) +_book_init_head(u3_book* txt_u) { - memset(&log_u->hed_u, 0, sizeof(u3_book_head)); - log_u->hed_u.mag_w = BOOK_MAGIC; - log_u->hed_u.ver_w = BOOK_VERSION; - log_u->hed_u.fir_d = 0; - log_u->hed_u.las_d = 0; - log_u->hed_u.off_w = 0; - log_u->hed_u.len_w = 0; - log_u->dit_o = c3y; + memset(&txt_u->hed_u, 0, sizeof(u3_book_head)); + txt_u->hed_u.mag_w = BOOK_MAGIC; + txt_u->hed_u.ver_w = BOOK_VERSION; + txt_u->hed_u.fir_d = 0; + txt_u->hed_u.las_d = 0; + txt_u->hed_u.off_w = 0; + txt_u->hed_u.len_w = 0; + txt_u->dit_o = c3y; } -/* _book_read_record(): read event record at offset. +/* _book_deed_size(): calculate total on-disk size of deed. +*/ +static inline c3_w +_book_deed_size(c3_d len_d) +{ + return sizeof(u3_book_deed_head) + (len_d - 4) + sizeof(u3_book_deed_tail); + // = 12 + (len_d - 4) + 12 = len_d + 20 +} + +/* _book_calc_crc(): compute CRC32 for reed. 
+*/ +static c3_w +_book_calc_crc(const u3_book_reed* red_u) +{ + c3_y buf_y[12]; // 8 bytes len_d + 4 bytes mug + memcpy(buf_y, &red_u->len_d, 8); + memcpy(buf_y + 8, &red_u->mug_l, 4); + + return _book_crc32_two(buf_y, 12, red_u->jam_y, red_u->len_d - 4); +} + +/* _book_okay_reed(): validate reed integrity. +*/ +static c3_o +_book_okay_reed(const u3_book_reed* red_u) +{ + // validate length + if ( 0 == red_u->len_d || (1ULL << 32) < red_u->len_d ) { + return c3n; + } + + // validate CRC + c3_w crc_w = _book_calc_crc(red_u); + if ( crc_w != red_u->crc_w ) { + return c3n; + } + + return c3y; +} + +/* _book_read_deed(): read deed from file into [red_u]. ** ** returns: -** c3y: success, buffers allocated +** c3y: success, jam_y allocated ** c3n: failure (EOF or corruption) ** -** on success, caller must free *mug_y and *jam_y +** on success, caller must free red_u->jam_y */ static c3_o -_book_read_record(c3_i fid_i, - c3_w* off_w, - c3_d* len_d, - c3_y** mug_y, - c3_y** jam_y, - c3_w* crc_w, - c3_d* let_d) +_book_read_deed(c3_i fid_i, c3_w* off_w, u3_book_reed* red_u) { c3_zs ret_zs; - c3_w off_now = *off_w; + c3_w now_w = *off_w; + c3_d let_d; - // read len_d (8 bytes) - ret_zs = pread(fid_i, len_d, sizeof(c3_d), off_now); - if ( ret_zs != sizeof(c3_d) ) { + // read deed_head + u3_book_deed_head hed_u; + ret_zs = pread(fid_i, &hed_u, sizeof(u3_book_deed_head), now_w); + if ( ret_zs != sizeof(u3_book_deed_head) ) { return c3n; } - off_now += sizeof(c3_d); + now_w += sizeof(u3_book_deed_head); // validate length - if ( 0 == *len_d || (1ULL << 32) < *len_d ) { - fprintf(stderr, "book: invalid length: %llu\r\n", *len_d); + if ( 0 == hed_u.len_d || (1ULL << 32) < hed_u.len_d ) { + fprintf(stderr, "book: invalid length: %llu\r\n", hed_u.len_d); return c3n; } - // read mug_l (4 bytes) - *mug_y = c3_malloc(4); - ret_zs = pread(fid_i, *mug_y, 4, off_now); - if ( ret_zs != 4 ) { - c3_free(*mug_y); + // populate reed from head + red_u->len_d = hed_u.len_d; + red_u->mug_l = 
hed_u.mug_l; + + // read jam data (len_d - mug bytes) + c3_d jaz_d = red_u->len_d - 4; + red_u->jam_y = c3_malloc(jaz_d); + ret_zs = pread(fid_i, red_u->jam_y, jaz_d, now_w); + if ( ret_zs != (c3_zs)jaz_d ) { + c3_free(red_u->jam_y); return c3n; } - off_now += 4; + now_w += jaz_d; - // read jam data (len_d - 4 bytes, since len_d includes mug) - c3_d jam_len = *len_d - 4; - *jam_y = c3_malloc(jam_len); - ret_zs = pread(fid_i, *jam_y, jam_len, off_now); - if ( ret_zs != (c3_zs)jam_len ) { - c3_free(*mug_y); - c3_free(*jam_y); + // read deed_tail + u3_book_deed_tail tal_u; + ret_zs = pread(fid_i, &tal_u, sizeof(u3_book_deed_tail), now_w); + if ( ret_zs != sizeof(u3_book_deed_tail) ) { + c3_free(red_u->jam_y); return c3n; } - off_now += jam_len; + now_w += sizeof(u3_book_deed_tail); + + // populate reed from tail + red_u->crc_w = tal_u.crc_w; + let_d = tal_u.let_d; - // read crc_m (4 bytes) - ret_zs = pread(fid_i, crc_w, sizeof(c3_w), off_now); - if ( ret_zs != sizeof(c3_w) ) { - c3_free(*mug_y); - c3_free(*jam_y); + // validate len_d == let_d + if ( red_u->len_d != let_d ) { + c3_free(red_u->jam_y); return c3n; } - off_now += sizeof(c3_w); - // read let_d (8 bytes) - ret_zs = pread(fid_i, let_d, sizeof(c3_d), off_now); - if ( ret_zs != sizeof(c3_d) ) { - c3_free(*mug_y); - c3_free(*jam_y); + // update offset + *off_w = now_w; + + return c3y; +} + +/* _book_save_deed(): save complete deed to file. 
+** +** returns: +** c3y: success +** c3n: failure +*/ +static c3_o +_book_save_deed(c3_i fid_i, c3_w* off_w, const u3_book_reed* red_u) +{ + c3_zs ret_zs; + c3_w now_w = *off_w; + c3_d jaz_d = red_u->len_d - 4; // len_d - mug bytes + + // write deed_head + u3_book_deed_head hed_u; + hed_u.len_d = red_u->len_d; + hed_u.mug_l = red_u->mug_l; + + ret_zs = pwrite(fid_i, &hed_u, sizeof(u3_book_deed_head), now_w); + if ( ret_zs != sizeof(u3_book_deed_head) ) { + return c3n; + } + now_w += sizeof(u3_book_deed_head); + + // write jam data + ret_zs = pwrite(fid_i, red_u->jam_y, jaz_d, now_w); + if ( ret_zs != (c3_zs)jaz_d ) { return c3n; } - off_now += sizeof(c3_d); + now_w += jaz_d; + + // write deed_tail + u3_book_deed_tail tal_u; + tal_u.crc_w = red_u->crc_w; + tal_u.let_d = red_u->len_d; // length trailer (same as len_d) + + ret_zs = pwrite(fid_i, &tal_u, sizeof(u3_book_deed_tail), now_w); + if ( ret_zs != sizeof(u3_book_deed_tail) ) { + return c3n; + } + now_w += sizeof(u3_book_deed_tail); // update offset - *off_w = off_now; + *off_w = now_w; + + return c3y; +} + +/* _book_skip_deed(): skip over deed without reading jam data. +** +** returns: +** c3y: success +** c3n: failure (EOF) +*/ +static c3_o +_book_skip_deed(c3_i fid_i, c3_w* off_w) +{ + c3_zs ret_zs; + c3_d len_d; + + // read only the len_d field + ret_zs = pread(fid_i, &len_d, sizeof(c3_d), *off_w); + if ( ret_zs != sizeof(c3_d) ) { + return c3n; + } + + // skip entire deed: deed_head + jam + deed_tail + *off_w += _book_deed_size(len_d); return c3y; } @@ -202,86 +305,63 @@ _book_read_record(c3_i fid_i, ** updates header if corruption detected. 
*/ static c3_w -_book_scan_end(u3_book* log_u) +_book_scan_end(u3_book* txt_u) { - c3_w off_w = sizeof(u3_book_head) + BOOK_META_SIZE; // events start here - c3_d count_d = 0; - c3_d expected_d; + c3_w off_w = sizeof(u3_book_head) + BOOK_META_SIZE; // start + c3_d cot_d = 0; // count + c3_d exp_d; // expected event number - if ( 0 == log_u->hed_u.fir_d && 0 == log_u->hed_u.las_d ) { + if ( 0 == txt_u->hed_u.fir_d && 0 == txt_u->hed_u.las_d ) { // empty log return off_w; } - expected_d = log_u->hed_u.las_d - log_u->hed_u.fir_d + 1; + exp_d = txt_u->hed_u.las_d - txt_u->hed_u.fir_d + 1; while ( 1 ) { - c3_d len_d, let_d; - c3_y* mug_y; - c3_y* jam_y; - c3_w crc_w, calc_crc; + u3_book_reed red_u; c3_w off_start = off_w; - if ( c3n == _book_read_record(log_u->fid_i, &off_w, - &len_d, &mug_y, &jam_y, - &crc_w, &let_d) ) - { + // read deed into reed + if ( c3n == _book_read_deed(txt_u->fid_i, &off_w, &red_u) ) { // EOF or read error break; } - // validate len_d == let_d - if ( len_d != let_d ) { - fprintf(stderr, "book: length mismatch at offset %u\r\n", off_start); - c3_free(mug_y); - c3_free(jam_y); - break; - } - - // validate CRC: CRC32(len_d || mug || jam) - { - c3_y buf_y[12]; // 8 bytes len_d + 4 bytes mug - memcpy(buf_y, &len_d, 8); - memcpy(buf_y + 8, mug_y, 4); - - calc_crc = _book_crc32(buf_y, 12); - calc_crc = (c3_w)crc32(calc_crc, jam_y, len_d - 4); - } - - c3_free(mug_y); - c3_free(jam_y); - - if ( crc_w != calc_crc ) { - fprintf(stderr, "book: CRC mismatch at offset %u\r\n", off_start); + // validate reed (CRC and length checks) + if ( c3n == _book_okay_reed(&red_u) ) { + fprintf(stderr, "book: validation failed at offset %u\r\n", off_start); + c3_free(red_u.jam_y); break; } - count_d++; + c3_free(red_u.jam_y); + cot_d++; } // check if we found fewer events than expected - if ( count_d != expected_d ) { + if ( cot_d != exp_d ) { fprintf(stderr, "book: recovery: found %llu events, expected %llu\r\n", - count_d, expected_d); + cot_d, exp_d); // update header 
- if ( count_d == 0 ) { - log_u->hed_u.fir_d = 0; - log_u->hed_u.las_d = 0; + if ( cot_d == 0 ) { + txt_u->hed_u.fir_d = 0; + txt_u->hed_u.las_d = 0; off_w = sizeof(u3_book_head); } else { - log_u->hed_u.las_d = log_u->hed_u.fir_d + count_d - 1; + txt_u->hed_u.las_d = txt_u->hed_u.fir_d + cot_d - 1; } - log_u->dit_o = c3y; - _book_write_header(log_u); + txt_u->dit_o = c3y; + _book_save_head(txt_u); // truncate file - if ( -1 == ftruncate(log_u->fid_i, off_w) ) { + if ( -1 == ftruncate(txt_u->fid_i, off_w) ) { fprintf(stderr, "book: failed to truncate: %s\r\n", strerror(errno)); } else { - c3_sync(log_u->fid_i); + c3_sync(txt_u->fid_i); } } @@ -296,7 +376,7 @@ u3_book_init(const c3_c* pax_c) c3_c path_c[8193]; c3_i fid_i; struct stat buf_u; - u3_book* log_u; + u3_book* txt_u; // construct path to book.log snprintf(path_c, sizeof(path_c), "%s/book.log", pax_c); @@ -317,76 +397,76 @@ u3_book_init(const c3_c* pax_c) } // allocate log structure - log_u = c3_calloc(sizeof(u3_book)); - log_u->fid_i = fid_i; - log_u->pax_c = c3_malloc(strlen(path_c) + 1); - strcpy(log_u->pax_c, path_c); + txt_u = c3_calloc(sizeof(u3_book)); + txt_u->fid_i = fid_i; + txt_u->pax_c = c3_malloc(strlen(path_c) + 1); + strcpy(txt_u->pax_c, path_c); if ( buf_u.st_size == 0 ) { // new file: initialize header - _book_init_header(log_u); - _book_write_header(log_u); + _book_init_head(txt_u); + _book_save_head(txt_u); // events start after header + reserved metadata area - log_u->off_w = sizeof(u3_book_head) + BOOK_META_SIZE; + txt_u->off_w = sizeof(u3_book_head) + BOOK_META_SIZE; } else if ( buf_u.st_size < (off_t)sizeof(u3_book_head) ) { // corrupt file: too small fprintf(stderr, "book: file too small: %lld bytes\r\n", (long long)buf_u.st_size); close(fid_i); - c3_free(log_u->pax_c); - c3_free(log_u); + c3_free(txt_u->pax_c); + c3_free(txt_u); return 0; } else { // existing file: read and validate header - if ( c3n == _book_read_header(log_u) ) { + if ( c3n == _book_read_head(txt_u) ) { 
close(fid_i); - c3_free(log_u->pax_c); - c3_free(log_u); + c3_free(txt_u->pax_c); + c3_free(txt_u); return 0; } // scan to find actual end, recover from corruption - log_u->off_w = _book_scan_end(log_u); + txt_u->off_w = _book_scan_end(txt_u); } - return log_u; + return txt_u; } /* u3_book_exit(): close event log. */ void -u3_book_exit(u3_book* log_u) +u3_book_exit(u3_book* txt_u) { - if ( !log_u ) { + if ( !txt_u ) { return; } // sync header if dirty - if ( c3y == log_u->dit_o ) { - _book_write_header(log_u); + if ( c3y == txt_u->dit_o ) { + _book_save_head(txt_u); } // close file - close(log_u->fid_i); + close(txt_u->fid_i); // free resources - c3_free(log_u->pax_c); - c3_free(log_u); + c3_free(txt_u->pax_c); + c3_free(txt_u); } /* u3_book_gulf(): read first and last event numbers. */ c3_o -u3_book_gulf(u3_book* log_u, c3_d* low_d, c3_d* hig_d) +u3_book_gulf(u3_book* txt_u, c3_d* low_d, c3_d* hig_d) { - if ( !log_u ) { + if ( !txt_u ) { return c3n; } - *low_d = log_u->hed_u.fir_d; - *hig_d = log_u->hed_u.las_d; + *low_d = txt_u->hed_u.fir_d; + *hig_d = txt_u->hed_u.las_d; return c3y; } @@ -449,141 +529,84 @@ u3_book_stat(const c3_c* pax_c) /* u3_book_save(): save [len_d] events starting at [eve_d]. 
** -** byt_p: array of buffers (mug + jam format) +** byt_p: array of buffers (mug + jam) ** siz_i: array of buffer sizes */ c3_o -u3_book_save(u3_book* log_u, +u3_book_save(u3_book* txt_u, c3_d eve_d, c3_d len_d, void** byt_p, c3_z* siz_i, c3_d epo_d) { - c3_w i; - c3_w off_now; + c3_w now_w; - if ( !log_u ) { + if ( !txt_u ) { return c3n; } // validate contiguity - if ( 0 == log_u->hed_u.las_d ) { + if ( 0 == txt_u->hed_u.las_d ) { // empty log: first event must be the first event in the epoch if ( epo_d + 1 != eve_d ) { fprintf(stderr, "book: first event must be 1, got %llu\r\n", eve_d); return c3n; } - log_u->hed_u.fir_d = eve_d; + txt_u->hed_u.fir_d = eve_d; } else { // non-empty: must be contiguous - if ( eve_d != log_u->hed_u.las_d + 1 ) { + if ( eve_d != txt_u->hed_u.las_d + 1 ) { fprintf(stderr, "book: event gap: expected %llu, got %llu\r\n", - log_u->hed_u.las_d + 1, eve_d); + txt_u->hed_u.las_d + 1, eve_d); return c3n; } } - // write each event record - off_now = log_u->off_w; + // write each event deed + now_w = txt_u->off_w; - for ( i = 0; i < len_d; i++ ) { - c3_y* buf_y = (c3_y*)byt_p[i]; - c3_d siz_d = (c3_d)siz_i[i]; - c3_d len_write; - c3_l mug_l; - c3_y* jam_y; - c3_d jam_len; - c3_w crc_w; - c3_zs ret_zs; - c3_y len_buf[8]; - c3_y crc_buf[4]; - c3_y let_buf[8]; + for ( c3_w i_w = 0; i_w < len_d; i_w++ ) { + c3_y* buf_y = (c3_y*)byt_p[i_w]; + c3_d siz_d = (c3_d)siz_i[i_w]; + u3_book_reed red_u; // extract mug from buffer (first 4 bytes) if ( siz_d < 4 ) { fprintf(stderr, "book: event %llu buffer too small: %llu\r\n", - eve_d + i, siz_d); + eve_d + i_w, siz_d); return c3n; } - memcpy(&mug_l, buf_y, 4); - jam_y = buf_y + 4; - jam_len = siz_d - 4; - - // len_d is total payload: 4 bytes mug + jam data - len_write = siz_d; - - // compute CRC32 over: len_d (8 bytes) + mug_l (4 bytes) + jam data - { - c3_y tmp_buf[12]; - memcpy(tmp_buf, &len_write, 8); - memcpy(tmp_buf + 8, &mug_l, 4); - crc_w = _book_crc32_two(tmp_buf, 12, jam_y, jam_len); - } - - // 
prepare buffers for writing - memcpy(len_buf, &len_write, 8); - memcpy(crc_buf, &crc_w, 4); - memcpy(let_buf, &len_write, 8); + // build reed from input buffer + memcpy(&red_u.mug_l, buf_y, 4); + red_u.jam_y = buf_y + 4; + red_u.len_d = siz_d; // total payload: mug + jam + red_u.crc_w = _book_calc_crc(&red_u); - // write record: len_d | mug_l | jam | crc_m | let_d - ret_zs = pwrite(log_u->fid_i, len_buf, 8, off_now); - if ( ret_zs != 8 ) { - fprintf(stderr, "book: failed to write len_d for event %llu: %s\r\n", - eve_d + i, strerror(errno)); + // save deed to file + if ( c3n == _book_save_deed(txt_u->fid_i, &now_w, &red_u) ) { + fprintf(stderr, "book: failed to save deed for event %llu: %s\r\n", + eve_d + i_w, strerror(errno)); return c3n; } - off_now += 8; - - ret_zs = pwrite(log_u->fid_i, &mug_l, 4, off_now); - if ( ret_zs != 4 ) { - fprintf(stderr, "book: failed to write mug for event %llu: %s\r\n", - eve_d + i, strerror(errno)); - return c3n; - } - off_now += 4; - - ret_zs = pwrite(log_u->fid_i, jam_y, jam_len, off_now); - if ( ret_zs != (c3_zs)jam_len ) { - fprintf(stderr, "book: failed to write jam for event %llu: %s\r\n", - eve_d + i, strerror(errno)); - return c3n; - } - off_now += jam_len; - - ret_zs = pwrite(log_u->fid_i, crc_buf, 4, off_now); - if ( ret_zs != 4 ) { - fprintf(stderr, "book: failed to write crc for event %llu: %s\r\n", - eve_d + i, strerror(errno)); - return c3n; - } - off_now += 4; - - ret_zs = pwrite(log_u->fid_i, let_buf, 8, off_now); - if ( ret_zs != 8 ) { - fprintf(stderr, "book: failed to write let_d for event %llu: %s\r\n", - eve_d + i, strerror(errno)); - return c3n; - } - off_now += 8; } // sync data to disk - if ( -1 == c3_sync(log_u->fid_i) ) { + if ( -1 == c3_sync(txt_u->fid_i) ) { fprintf(stderr, "book: failed to sync events: %s\r\n", strerror(errno)); return c3n; } // update header - log_u->hed_u.las_d = eve_d + len_d - 1; - log_u->off_w = off_now; - log_u->dit_o = c3y; + txt_u->hed_u.las_d = eve_d + len_d - 1; + txt_u->off_w 
= now_w; + txt_u->dit_o = c3y; // write and sync header - if ( c3n == _book_write_header(log_u) ) { + if ( c3n == _book_save_head(txt_u) ) { return c3n; } @@ -599,7 +622,7 @@ u3_book_save(u3_book* log_u, ** buf_v: buffer pointer (mug + jam format) */ c3_o -u3_book_read(u3_book* log_u, +u3_book_read(u3_book* txt_u, void* ptr_v, c3_d eve_d, c3_d len_d, @@ -607,100 +630,67 @@ u3_book_read(u3_book* log_u, { c3_w off_w; c3_d cur_d; - c3_d i; - if ( !log_u ) { + if ( !txt_u ) { return c3n; } // validate range - if ( 0 == log_u->hed_u.las_d ) { + if ( 0 == txt_u->hed_u.las_d ) { // empty log fprintf(stderr, "book: read from empty log\r\n"); return c3n; } - if ( eve_d < log_u->hed_u.fir_d || eve_d > log_u->hed_u.las_d ) { + if ( eve_d < txt_u->hed_u.fir_d || eve_d > txt_u->hed_u.las_d ) { fprintf(stderr, "book: event %llu out of range [%llu, %llu]\r\n", - eve_d, log_u->hed_u.fir_d, log_u->hed_u.las_d); + eve_d, txt_u->hed_u.fir_d, txt_u->hed_u.las_d); return c3n; } - if ( eve_d + len_d - 1 > log_u->hed_u.las_d ) { + if ( eve_d + len_d - 1 > txt_u->hed_u.las_d ) { fprintf(stderr, "book: read range exceeds last event\r\n"); return c3n; } // scan to starting event (events start after header + metadata area) off_w = sizeof(u3_book_head) + BOOK_META_SIZE; - cur_d = log_u->hed_u.fir_d; + cur_d = txt_u->hed_u.fir_d; while ( cur_d < eve_d ) { - c3_d skip_len; - c3_zs ret_zs; - - ret_zs = pread(log_u->fid_i, &skip_len, sizeof(c3_d), off_w); - if ( ret_zs != sizeof(c3_d) ) { + if ( c3n == _book_skip_deed(txt_u->fid_i, &off_w) ) { fprintf(stderr, "book: failed to scan to event %llu\r\n", eve_d); return c3n; } - - // skip entire record: len_d(8) + mug(4) + jam(len-4) + crc(4) + let_d(8) - off_w += 8 + 4 + (skip_len - 4) + 4 + 8; cur_d++; } // read requested events - for ( i = 0; i < len_d; i++, cur_d++ ) { - c3_d len_rec; - c3_y* mug_y; - c3_y* jam_y; - c3_w crc_w, calc_crc; - c3_d let_d; + for ( c3_d i_d = 0; i_d < len_d; i_d++, cur_d++ ) { + u3_book_reed red_u; c3_y* buf_y; c3_z 
len_z; - // read record - if ( c3n == _book_read_record(log_u->fid_i, &off_w, - &len_rec, &mug_y, &jam_y, - &crc_w, &let_d) ) - { + // read deed into reed + if ( c3n == _book_read_deed(txt_u->fid_i, &off_w, &red_u) ) { fprintf(stderr, "book: failed to read event %llu\r\n", cur_d); return c3n; } - // validate len_d == let_d - if ( len_rec != let_d ) { - fprintf(stderr, "book: length mismatch at event %llu\r\n", cur_d); - c3_free(mug_y); - c3_free(jam_y); - return c3n; - } - - // validate CRC - { - c3_y tmp_buf[12]; - memcpy(tmp_buf, &len_rec, 8); - memcpy(tmp_buf + 8, mug_y, 4); - calc_crc = _book_crc32(tmp_buf, 12); - calc_crc = (c3_w)crc32(calc_crc, jam_y, len_rec - 4); - } - - if ( crc_w != calc_crc ) { - fprintf(stderr, "book: CRC mismatch at event %llu\r\n", cur_d); - c3_free(mug_y); - c3_free(jam_y); + // validate reed + if ( c3n == _book_okay_reed(&red_u) ) { + fprintf(stderr, "book: validation failed at event %llu\r\n", cur_d); + c3_free(red_u.jam_y); return c3n; } // reconstruct buffer in mug + jam format for callback - len_z = len_rec; + len_z = red_u.len_d; buf_y = c3_malloc(len_z); - memcpy(buf_y, mug_y, 4); - memcpy(buf_y + 4, jam_y, len_rec - 4); + memcpy(buf_y, &red_u.mug_l, 4); + memcpy(buf_y + 4, red_u.jam_y, red_u.len_d - 4); - c3_free(mug_y); - c3_free(jam_y); + c3_free(red_u.jam_y); // invoke callback if ( c3n == read_f(ptr_v, cur_d, len_z, buf_y) ) { @@ -719,7 +709,7 @@ u3_book_read(u3_book* log_u, ** sets up iterator to read events from [nex_d] to [las_d] inclusive. 
*/ c3_o -u3_book_walk_init(u3_book* log_u, +u3_book_walk_init(u3_book* txt_u, u3_book_walk* itr_u, c3_d nex_d, c3_d las_d) @@ -727,49 +717,42 @@ u3_book_walk_init(u3_book* log_u, c3_w off_w; c3_d cur_d; - if ( !log_u || !itr_u ) { + if ( !txt_u || !itr_u ) { return c3n; } // validate range - if ( 0 == log_u->hed_u.las_d ) { + if ( 0 == txt_u->hed_u.las_d ) { fprintf(stderr, "book: walk_init on empty log\r\n"); return c3n; } - if ( nex_d < log_u->hed_u.fir_d || nex_d > log_u->hed_u.las_d ) { + if ( nex_d < txt_u->hed_u.fir_d || nex_d > txt_u->hed_u.las_d ) { fprintf(stderr, "book: walk_init start %llu out of range [%llu, %llu]\r\n", - nex_d, log_u->hed_u.fir_d, log_u->hed_u.las_d); + nex_d, txt_u->hed_u.fir_d, txt_u->hed_u.las_d); return c3n; } - if ( las_d < nex_d || las_d > log_u->hed_u.las_d ) { + if ( las_d < nex_d || las_d > txt_u->hed_u.las_d ) { fprintf(stderr, "book: walk_init end %llu out of range [%llu, %llu]\r\n", - las_d, nex_d, log_u->hed_u.las_d); + las_d, nex_d, txt_u->hed_u.las_d); return c3n; } // scan to starting event (events start after header + metadata area) off_w = sizeof(u3_book_head) + BOOK_META_SIZE; - cur_d = log_u->hed_u.fir_d; + cur_d = txt_u->hed_u.fir_d; while ( cur_d < nex_d ) { - c3_d skip_len; - c3_zs ret_zs; - - ret_zs = pread(log_u->fid_i, &skip_len, sizeof(c3_d), off_w); - if ( ret_zs != sizeof(c3_d) ) { + if ( c3n == _book_skip_deed(txt_u->fid_i, &off_w) ) { fprintf(stderr, "book: walk_init failed to scan to event %llu\r\n", nex_d); return c3n; } - - // skip entire record - off_w += 8 + 4 + (skip_len - 4) + 4 + 8; cur_d++; } // initialize iterator - itr_u->fid_i = log_u->fid_i; + itr_u->fid_i = txt_u->fid_i; itr_u->nex_d = nex_d; itr_u->las_d = las_d; itr_u->off_w = off_w; @@ -786,11 +769,7 @@ u3_book_walk_init(u3_book* log_u, c3_o u3_book_walk_next(u3_book_walk* itr_u, c3_z* len_z, void** buf_v) { - c3_d len_rec; - c3_y* mug_y; - c3_y* jam_y; - c3_w crc_w, calc_crc; - c3_d let_d; + u3_book_reed red_u; c3_y* buf_y; if ( !itr_u 
|| c3n == itr_u->liv_o ) { @@ -803,53 +782,30 @@ u3_book_walk_next(u3_book_walk* itr_u, c3_z* len_z, void** buf_v) return c3n; } - // read record - if ( c3n == _book_read_record(itr_u->fid_i, &itr_u->off_w, - &len_rec, &mug_y, &jam_y, - &crc_w, &let_d) ) - { + // read deed into reed + if ( c3n == _book_read_deed(itr_u->fid_i, &itr_u->off_w, &red_u) ) { fprintf(stderr, "book: walk_next failed to read event %llu\r\n", itr_u->nex_d); itr_u->liv_o = c3n; return c3n; } - // validate len_d == let_d - if ( len_rec != let_d ) { - fprintf(stderr, "book: walk_next length mismatch at event %llu\r\n", + // validate reed + if ( c3n == _book_okay_reed(&red_u) ) { + fprintf(stderr, "book: walk_next validation failed at event %llu\r\n", itr_u->nex_d); - c3_free(mug_y); - c3_free(jam_y); - itr_u->liv_o = c3n; - return c3n; - } - - // validate CRC - { - c3_y tmp_buf[12]; - memcpy(tmp_buf, &len_rec, 8); - memcpy(tmp_buf + 8, mug_y, 4); - calc_crc = _book_crc32(tmp_buf, 12); - calc_crc = (c3_w)crc32(calc_crc, jam_y, len_rec - 4); - } - - if ( crc_w != calc_crc ) { - fprintf(stderr, "book: walk_next CRC mismatch at event %llu\r\n", - itr_u->nex_d); - c3_free(mug_y); - c3_free(jam_y); + c3_free(red_u.jam_y); itr_u->liv_o = c3n; return c3n; } // reconstruct buffer in mug + jam format - *len_z = len_rec; + *len_z = red_u.len_d; buf_y = c3_malloc(*len_z); - memcpy(buf_y, mug_y, 4); - memcpy(buf_y + 4, jam_y, len_rec - 4); + memcpy(buf_y, &red_u.mug_l, 4); + memcpy(buf_y + 4, red_u.jam_y, red_u.len_d - 4); - c3_free(mug_y); - c3_free(jam_y); + c3_free(red_u.jam_y); *buf_v = buf_y; @@ -878,38 +834,37 @@ u3_book_walk_done(u3_book_walk* itr_u) ** invokes callback with (ptr_v, len, data) or (ptr_v, -1, 0) if not found. 
*/ void -u3_book_read_meta(u3_book* log_u, +u3_book_read_meta(u3_book* txt_u, void* ptr_v, const c3_c* key_c, void (*read_f)(void*, c3_zs, void*)) { - c3_w key_len; - c3_y* meta_buf; - c3_w meta_len; + c3_w ken_w; // key length + c3_y* buf_y; // metadata buffer + c3_w len_w; // metadata length c3_zs ret_zs; - c3_w offset; - c3_w count; - c3_w i; + c3_w off_w; + c3_w cot_w; // count - if ( !log_u ) { + if ( !txt_u ) { read_f(ptr_v, -1, 0); return; } // check if metadata section exists - if ( 0 == log_u->hed_u.len_w ) { + if ( 0 == txt_u->hed_u.len_w ) { read_f(ptr_v, -1, 0); return; } // read entire metadata section - meta_len = log_u->hed_u.len_w; - meta_buf = c3_malloc(meta_len); + len_w = txt_u->hed_u.len_w; + buf_y = c3_malloc(len_w); - ret_zs = pread(log_u->fid_i, meta_buf, meta_len, log_u->hed_u.off_w); - if ( ret_zs != (c3_zs)meta_len ) { + ret_zs = pread(txt_u->fid_i, buf_y, len_w, txt_u->hed_u.off_w); + if ( ret_zs != (c3_zs)len_w ) { fprintf(stderr, "book: read_meta: failed to read metadata section\r\n"); - c3_free(meta_buf); + c3_free(buf_y); read_f(ptr_v, -1, 0); return; } @@ -918,78 +873,78 @@ u3_book_read_meta(u3_book* log_u, // format: [4 bytes: count] + entries // entry: [4 bytes: key_len][key][4 bytes: val_len][val] - if ( meta_len < 4 ) { + if ( len_w < 4 ) { fprintf(stderr, "book: read_meta: metadata section too small\r\n"); - c3_free(meta_buf); + c3_free(buf_y); read_f(ptr_v, -1, 0); return; } - memcpy(&count, meta_buf, 4); - offset = 4; + memcpy(&cot_w, buf_y, 4); + off_w = 4; - key_len = strlen(key_c); + ken_w = strlen(key_c); // linear search for key - for ( i = 0; i < count; i++ ) { + for ( c3_w i_w = 0; i_w < cot_w; i_w++ ) { c3_w entry_key_len; c3_y* entry_key; c3_w entry_val_len; c3_y* entry_val; // read key length - if ( offset + 4 > meta_len ) { + if ( off_w + 4 > len_w ) { fprintf(stderr, "book: read_meta: corrupt metadata (key len)\r\n"); - c3_free(meta_buf); + c3_free(buf_y); read_f(ptr_v, -1, 0); return; } - memcpy(&entry_key_len, 
meta_buf + offset, 4); - offset += 4; + memcpy(&entry_key_len, buf_y + off_w, 4); + off_w += 4; // read key - if ( offset + entry_key_len > meta_len ) { + if ( off_w + entry_key_len > len_w ) { fprintf(stderr, "book: read_meta: corrupt metadata (key)\r\n"); - c3_free(meta_buf); + c3_free(buf_y); read_f(ptr_v, -1, 0); return; } - entry_key = meta_buf + offset; - offset += entry_key_len; + entry_key = buf_y + off_w; + off_w += entry_key_len; // read value length - if ( offset + 4 > meta_len ) { + if ( off_w + 4 > len_w ) { fprintf(stderr, "book: read_meta: corrupt metadata (val len)\r\n"); - c3_free(meta_buf); + c3_free(buf_y); read_f(ptr_v, -1, 0); return; } - memcpy(&entry_val_len, meta_buf + offset, 4); - offset += 4; + memcpy(&entry_val_len, buf_y + off_w, 4); + off_w += 4; // read value - if ( offset + entry_val_len > meta_len ) { + if ( off_w + entry_val_len > len_w ) { fprintf(stderr, "book: read_meta: corrupt metadata (val)\r\n"); - c3_free(meta_buf); + c3_free(buf_y); read_f(ptr_v, -1, 0); return; } - entry_val = meta_buf + offset; - offset += entry_val_len; + entry_val = buf_y + off_w; + off_w += entry_val_len; // check if this is the key we're looking for - if ( entry_key_len == key_len && - 0 == memcmp(entry_key, key_c, key_len) ) + if ( entry_key_len == ken_w && + 0 == memcmp(entry_key, key_c, ken_w) ) { // found it - invoke callback read_f(ptr_v, entry_val_len, entry_val); - c3_free(meta_buf); + c3_free(buf_y); return; } } // not found - c3_free(meta_buf); + c3_free(buf_y); read_f(ptr_v, -1, 0); } @@ -998,7 +953,7 @@ u3_book_read_meta(u3_book* log_u, ** updates or inserts key-value pair in metadata section. 
*/ c3_o -u3_book_save_meta(u3_book* log_u, +u3_book_save_meta(u3_book* txt_u, const c3_c* key_c, c3_z val_z, void* val_p) @@ -1011,22 +966,21 @@ u3_book_save_meta(u3_book* log_u, c3_w new_len; c3_w new_count; c3_w offset; - c3_w i; c3_o found = c3n; c3_zs ret_zs; - if ( !log_u ) { + if ( !txt_u ) { return c3n; } key_len = strlen(key_c); // read existing metadata if present - if ( 0 != log_u->hed_u.len_w ) { - old_len = log_u->hed_u.len_w; + if ( 0 != txt_u->hed_u.len_w ) { + old_len = txt_u->hed_u.len_w; old_meta = c3_malloc(old_len); - ret_zs = pread(log_u->fid_i, old_meta, old_len, log_u->hed_u.off_w); + ret_zs = pread(txt_u->fid_i, old_meta, old_len, txt_u->hed_u.off_w); if ( ret_zs != (c3_zs)old_len ) { fprintf(stderr, "book: save_meta: failed to read old metadata\r\n"); c3_free(old_meta); @@ -1049,7 +1003,7 @@ u3_book_save_meta(u3_book* log_u, // add existing entries (except if we're updating) if ( old_meta ) { offset = 4; - for ( i = 0; i < old_count; i++ ) { + for ( c3_w i_w = 0; i_w < old_count; i_w++ ) { c3_w entry_key_len, entry_val_len; if ( offset + 4 > old_len ) break; @@ -1096,7 +1050,7 @@ u3_book_save_meta(u3_book* log_u, // copy existing entries (except updated one) if ( old_meta ) { c3_w old_offset = 4; - for ( i = 0; i < old_count; i++ ) { + for ( c3_w i_w = 0; i_w < old_count; i_w++ ) { c3_w entry_key_len, entry_val_len; if ( old_offset + 4 > old_len ) break; @@ -1161,7 +1115,7 @@ u3_book_save_meta(u3_book* log_u, return c3n; } - ret_zs = pwrite(log_u->fid_i, new_meta, new_len, new_off); + ret_zs = pwrite(txt_u->fid_i, new_meta, new_len, new_off); if ( ret_zs != (c3_zs)new_len ) { fprintf(stderr, "book: save_meta: failed to write metadata: %s\r\n", strerror(errno)); @@ -1174,19 +1128,19 @@ u3_book_save_meta(u3_book* log_u, if ( old_meta ) c3_free(old_meta); // sync metadata - if ( -1 == c3_sync(log_u->fid_i) ) { + if ( -1 == c3_sync(txt_u->fid_i) ) { fprintf(stderr, "book: save_meta: failed to sync metadata: %s\r\n", strerror(errno)); return c3n; 
} // update header - log_u->hed_u.off_w = new_off; - log_u->hed_u.len_w = new_len; - log_u->dit_o = c3y; + txt_u->hed_u.off_w = new_off; + txt_u->hed_u.len_w = new_len; + txt_u->dit_o = c3y; // write and sync header - if ( c3n == _book_write_header(log_u) ) { + if ( c3n == _book_save_head(txt_u) ) { return c3n; } diff --git a/pkg/vere/db/book.h b/pkg/vere/db/book.h index b08ffe9e57..1fa1f38695 100644 --- a/pkg/vere/db/book.h +++ b/pkg/vere/db/book.h @@ -39,6 +39,40 @@ c3_o liv_o; // iterator valid } u3_book_walk; + /* u3_book_deed_head: on-disk deed header + */ + typedef struct _u3_book_deed_head { + c3_d len_d; // payload size (mug + jam) + c3_l mug_l; // mug/hash + } u3_book_deed_head; + + /* u3_book_deed_tail: on-disk deed trailer + */ + typedef struct _u3_book_deed_tail { + c3_w crc_w; // CRC32 checksum + c3_d let_d; // length trailer (validates len_d) + } u3_book_deed_tail; + + /* u3_book_deed: complete on-disk event record + ** + ** NB: not used directly for I/O due to variable-length jam data + ** Actual format: deed_head | jam_data | deed_tail + */ + typedef struct _u3_book_deed { + u3_book_deed_head hed_u; + // c3_y jam_y[]; // variable-length jam data + u3_book_deed_tail tal_u; + } u3_book_deed; + + /* u3_book_reed: in-memory event record representation for I/O + */ + typedef struct _u3_book_reed { + c3_d len_d; // total payload size + c3_l mug_l; // mug/hash + c3_y* jam_y; // jam data (caller owns, len = len_d - 4) + c3_w crc_w; // CRC32 checksum + } u3_book_reed; + /* u3_book_init(): open/create event log at [pax_c]. */ u3_book* @@ -47,7 +81,7 @@ /* u3_book_exit(): close event log. */ void - u3_book_exit(u3_book* log_u); + u3_book_exit(u3_book* txt_u); /* u3_book_stat(): print book stats. */ @@ -57,12 +91,12 @@ /* u3_book_gulf(): read first and last event numbers. */ c3_o - u3_book_gulf(u3_book* log_u, c3_d* low_d, c3_d* hig_d); + u3_book_gulf(u3_book* txt_u, c3_d* low_d, c3_d* hig_d); /* u3_book_read(): read [len_d] events starting at [eve_d]. 
*/ c3_o - u3_book_read(u3_book* log_u, + u3_book_read(u3_book* txt_u, void* ptr_v, c3_d eve_d, c3_d len_d, @@ -71,7 +105,7 @@ /* u3_book_save(): save [len_d] events starting at [eve_d]. */ c3_o - u3_book_save(u3_book* log_u, + u3_book_save(u3_book* txt_u, c3_d eve_d, c3_d len_d, void** byt_p, @@ -81,7 +115,7 @@ /* u3_book_read_meta(): read metadata by string key from log. */ void - u3_book_read_meta(u3_book* log_u, + u3_book_read_meta(u3_book* txt_u, void* ptr_v, const c3_c* key_c, void (*read_f)(void*, c3_zs, void*)); @@ -89,7 +123,7 @@ /* u3_book_save_meta(): save metadata by string key into log. */ c3_o - u3_book_save_meta(u3_book* log_u, + u3_book_save_meta(u3_book* txt_u, const c3_c* key_c, c3_z val_z, void* val_p); @@ -97,7 +131,7 @@ /* u3_book_walk_init(): initialize event iterator. */ c3_o - u3_book_walk_init(u3_book* log_u, + u3_book_walk_init(u3_book* txt_u, u3_book_walk* itr_u, c3_d nex_d, c3_d las_d); From 6bed2670e82dad83fff31aca21604d2cfb0dd320 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Fri, 2 Jan 2026 09:11:08 -0500 Subject: [PATCH 04/38] book: uses `PRIu64` instead of `llu` format specifier for portability --- pkg/vere/db/book.c | 36 ++++++++++++++++++------------------ pkg/vere/db/book.h | 2 +- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index 11c582f46b..71d9c99957 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -183,7 +183,7 @@ _book_read_deed(c3_i fid_i, c3_w* off_w, u3_book_reed* red_u) // validate length if ( 0 == hed_u.len_d || (1ULL << 32) < hed_u.len_d ) { - fprintf(stderr, "book: invalid length: %llu\r\n", hed_u.len_d); + fprintf(stderr, "book: invalid length: %" PRIu64 "\r\n", hed_u.len_d); return c3n; } @@ -341,7 +341,7 @@ _book_scan_end(u3_book* txt_u) // check if we found fewer events than expected if ( cot_d != exp_d ) { - fprintf(stderr, "book: recovery: found %llu events, expected %llu\r\n", + fprintf(stderr, "book: recovery: found %" PRIu64 " events, 
expected %" PRIu64 "\r\n", cot_d, exp_d); // update header @@ -515,9 +515,9 @@ u3_book_stat(const c3_c* pax_c) fprintf(stderr, "book info:\r\n"); fprintf(stderr, " file: %s\r\n", pax_c); fprintf(stderr, " version: %u\r\n", hed_u.ver_w); - fprintf(stderr, " first event: %llu\r\n", hed_u.fir_d); - fprintf(stderr, " last event: %llu\r\n", hed_u.las_d); - fprintf(stderr, " event count: %llu\r\n", + fprintf(stderr, " first event: %" PRIu64 "\r\n", hed_u.fir_d); + fprintf(stderr, " last event: %" PRIu64 "\r\n", hed_u.las_d); + fprintf(stderr, " event count: %" PRIu64 "\r\n", (0 == hed_u.las_d ) ? 0 : (hed_u.las_d - hed_u.fir_d + 1)); fprintf(stderr, " file size: %lld bytes\r\n", (long long)buf_u.st_size); @@ -550,7 +550,7 @@ u3_book_save(u3_book* txt_u, if ( 0 == txt_u->hed_u.las_d ) { // empty log: first event must be the first event in the epoch if ( epo_d + 1 != eve_d ) { - fprintf(stderr, "book: first event must be 1, got %llu\r\n", eve_d); + fprintf(stderr, "book: first event must be 1, got %" PRIu64 "\r\n", eve_d); return c3n; } txt_u->hed_u.fir_d = eve_d; @@ -558,7 +558,7 @@ u3_book_save(u3_book* txt_u, else { // non-empty: must be contiguous if ( eve_d != txt_u->hed_u.las_d + 1 ) { - fprintf(stderr, "book: event gap: expected %llu, got %llu\r\n", + fprintf(stderr, "book: event gap: expected %" PRIu64 ", got %" PRIu64 "\r\n", txt_u->hed_u.las_d + 1, eve_d); return c3n; } @@ -574,7 +574,7 @@ u3_book_save(u3_book* txt_u, // extract mug from buffer (first 4 bytes) if ( siz_d < 4 ) { - fprintf(stderr, "book: event %llu buffer too small: %llu\r\n", + fprintf(stderr, "book: event %" PRIu64 " buffer too small: %" PRIu64 "\r\n", eve_d + i_w, siz_d); return c3n; } @@ -587,7 +587,7 @@ u3_book_save(u3_book* txt_u, // save deed to file if ( c3n == _book_save_deed(txt_u->fid_i, &now_w, &red_u) ) { - fprintf(stderr, "book: failed to save deed for event %llu: %s\r\n", + fprintf(stderr, "book: failed to save deed for event %" PRIu64 ": %s\r\n", eve_d + i_w, strerror(errno)); 
return c3n; } @@ -643,7 +643,7 @@ u3_book_read(u3_book* txt_u, } if ( eve_d < txt_u->hed_u.fir_d || eve_d > txt_u->hed_u.las_d ) { - fprintf(stderr, "book: event %llu out of range [%llu, %llu]\r\n", + fprintf(stderr, "book: event %" PRIu64 " out of range [%" PRIu64 ", %" PRIu64 "]\r\n", eve_d, txt_u->hed_u.fir_d, txt_u->hed_u.las_d); return c3n; } @@ -659,7 +659,7 @@ u3_book_read(u3_book* txt_u, while ( cur_d < eve_d ) { if ( c3n == _book_skip_deed(txt_u->fid_i, &off_w) ) { - fprintf(stderr, "book: failed to scan to event %llu\r\n", eve_d); + fprintf(stderr, "book: failed to scan to event %" PRIu64 "\r\n", eve_d); return c3n; } cur_d++; @@ -673,13 +673,13 @@ u3_book_read(u3_book* txt_u, // read deed into reed if ( c3n == _book_read_deed(txt_u->fid_i, &off_w, &red_u) ) { - fprintf(stderr, "book: failed to read event %llu\r\n", cur_d); + fprintf(stderr, "book: failed to read event %" PRIu64 "\r\n", cur_d); return c3n; } // validate reed if ( c3n == _book_okay_reed(&red_u) ) { - fprintf(stderr, "book: validation failed at event %llu\r\n", cur_d); + fprintf(stderr, "book: validation failed at event %" PRIu64 "\r\n", cur_d); c3_free(red_u.jam_y); return c3n; } @@ -728,13 +728,13 @@ u3_book_walk_init(u3_book* txt_u, } if ( nex_d < txt_u->hed_u.fir_d || nex_d > txt_u->hed_u.las_d ) { - fprintf(stderr, "book: walk_init start %llu out of range [%llu, %llu]\r\n", + fprintf(stderr, "book: walk_init start %" PRIu64 " out of range [%" PRIu64 ", %" PRIu64 "]\r\n", nex_d, txt_u->hed_u.fir_d, txt_u->hed_u.las_d); return c3n; } if ( las_d < nex_d || las_d > txt_u->hed_u.las_d ) { - fprintf(stderr, "book: walk_init end %llu out of range [%llu, %llu]\r\n", + fprintf(stderr, "book: walk_init end %" PRIu64 " out of range [%" PRIu64 ", %" PRIu64 "]\r\n", las_d, nex_d, txt_u->hed_u.las_d); return c3n; } @@ -745,7 +745,7 @@ u3_book_walk_init(u3_book* txt_u, while ( cur_d < nex_d ) { if ( c3n == _book_skip_deed(txt_u->fid_i, &off_w) ) { - fprintf(stderr, "book: walk_init failed to scan to 
event %llu\r\n", nex_d); + fprintf(stderr, "book: walk_init failed to scan to event %" PRIu64 "\r\n", nex_d); return c3n; } cur_d++; @@ -784,7 +784,7 @@ u3_book_walk_next(u3_book_walk* itr_u, c3_z* len_z, void** buf_v) // read deed into reed if ( c3n == _book_read_deed(itr_u->fid_i, &itr_u->off_w, &red_u) ) { - fprintf(stderr, "book: walk_next failed to read event %llu\r\n", + fprintf(stderr, "book: walk_next failed to read event %" PRIu64 "\r\n", itr_u->nex_d); itr_u->liv_o = c3n; return c3n; @@ -792,7 +792,7 @@ u3_book_walk_next(u3_book_walk* itr_u, c3_z* len_z, void** buf_v) // validate reed if ( c3n == _book_okay_reed(&red_u) ) { - fprintf(stderr, "book: walk_next validation failed at event %llu\r\n", + fprintf(stderr, "book: walk_next validation failed at event %" PRIu64 "\r\n", itr_u->nex_d); c3_free(red_u.jam_y); itr_u->liv_o = c3n; diff --git a/pkg/vere/db/book.h b/pkg/vere/db/book.h index 1fa1f38695..d7e7ef093f 100644 --- a/pkg/vere/db/book.h +++ b/pkg/vere/db/book.h @@ -56,7 +56,7 @@ /* u3_book_deed: complete on-disk event record ** ** NB: not used directly for I/O due to variable-length jam data - ** Actual format: deed_head | jam_data | deed_tail + ** actual format: deed_head | jam_data | deed_tail */ typedef struct _u3_book_deed { u3_book_deed_head hed_u; From 4beab9082bbd057ce9b6b85d8a20bf643e3c5b8f Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Fri, 2 Jan 2026 10:50:32 -0500 Subject: [PATCH 05/38] book: adds failure mode tests --- pkg/vere/book_tests.c | 1889 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 1863 insertions(+), 26 deletions(-) diff --git a/pkg/vere/book_tests.c b/pkg/vere/book_tests.c index af31c0b275..63be19dd5d 100644 --- a/pkg/vere/book_tests.c +++ b/pkg/vere/book_tests.c @@ -99,6 +99,262 @@ _test_verify_event(c3_d eve_d, c3_z siz_z, void* buf_v) return c3y; } +/* corruption test helpers +*/ + +/* _test_get_book_path(): build path to book.log file. 
+*/ +static void +_test_get_book_path(const c3_c* dir_c, c3_c* path_c, c3_z max_z) +{ + snprintf(path_c, max_z, "%s/book.log", dir_c); +} + +/* _test_get_file_size(): get size of book.log file. +*/ +static c3_o +_test_get_file_size(const c3_c* dir_c, c3_z* siz_z) +{ + c3_c path_c[8193]; + struct stat st; + + _test_get_book_path(dir_c, path_c, sizeof(path_c)); + + if ( 0 != stat(path_c, &st) ) { + fprintf(stderr, "book_tests: stat failed: %s\r\n", path_c); + return c3n; + } + + *siz_z = st.st_size; + return c3y; +} + +/* _test_calculate_event_offset(): calculate byte offset to specific event. +*/ +static c3_o +_test_calculate_event_offset(const c3_c* dir_c, c3_d target_eve, c3_w* off_w) +{ + c3_c path_c[8193]; + c3_i fid_i; + u3_book_head hed_u; + c3_d cur_d; + c3_w cur_off; + c3_zs ret_zs; + + _test_get_book_path(dir_c, path_c, sizeof(path_c)); + + fid_i = c3_open(path_c, O_RDONLY, 0); + if ( 0 > fid_i ) { + fprintf(stderr, "book_tests: open failed: %s\r\n", path_c); + return c3n; + } + + // read header + ret_zs = pread(fid_i, &hed_u, sizeof(u3_book_head), 0); + if ( sizeof(u3_book_head) != ret_zs ) { + fprintf(stderr, "book_tests: header read failed\r\n"); + close(fid_i); + return c3n; + } + + // allow target beyond current range (for corruption tests) + // just scan up to target or last event + c3_d scan_to = (target_eve <= hed_u.las_d) ? 
target_eve : hed_u.las_d + 1; + + // scan to target event + cur_off = 64 + 256; // sizeof(u3_book_head) + BOOK_META_SIZE + + for ( cur_d = hed_u.fir_d; cur_d < scan_to; cur_d++ ) { + u3_book_deed_head deed_hed; + + ret_zs = pread(fid_i, &deed_hed, sizeof(u3_book_deed_head), cur_off); + if ( sizeof(u3_book_deed_head) != ret_zs ) { + fprintf(stderr, "book_tests: deed header read failed at event %llu offset %u\r\n", + cur_d, cur_off); + close(fid_i); + return c3n; + } + + // total deed size = head(16 with padding) + (len_d - 4) + tail(16 with padding) + // = 16 + (len_d - 4) + 16 = len_d + 28 + cur_off += (deed_hed.len_d + 28); + } + + close(fid_i); + *off_w = cur_off; + return c3y; +} + +/* _test_corrupt_magic(): corrupt magic number in header. +*/ +static c3_o +_test_corrupt_magic(const c3_c* dir_c, c3_w bad_magic) +{ + c3_c path_c[8193]; + c3_i fid_i; + c3_zs ret_zs; + + _test_get_book_path(dir_c, path_c, sizeof(path_c)); + + fid_i = c3_open(path_c, O_RDWR, 0); + if ( 0 > fid_i ) { + fprintf(stderr, "book_tests: corrupt_magic open failed\r\n"); + return c3n; + } + + ret_zs = pwrite(fid_i, &bad_magic, sizeof(c3_w), 0); + if ( sizeof(c3_w) != ret_zs ) { + fprintf(stderr, "book_tests: corrupt_magic write failed\r\n"); + close(fid_i); + return c3n; + } + + c3_sync(fid_i); + close(fid_i); + return c3y; +} + +/* _test_corrupt_version(): corrupt version in header. 
+*/ +static c3_o +_test_corrupt_version(const c3_c* dir_c, c3_w bad_version) +{ + c3_c path_c[8193]; + c3_i fid_i; + c3_zs ret_zs; + + _test_get_book_path(dir_c, path_c, sizeof(path_c)); + + fid_i = c3_open(path_c, O_RDWR, 0); + if ( 0 > fid_i ) { + fprintf(stderr, "book_tests: corrupt_version open failed\r\n"); + return c3n; + } + + ret_zs = pwrite(fid_i, &bad_version, sizeof(c3_w), 4); // offset 4 + if ( sizeof(c3_w) != ret_zs ) { + fprintf(stderr, "book_tests: corrupt_version write failed\r\n"); + close(fid_i); + return c3n; + } + + c3_sync(fid_i); + close(fid_i); + return c3y; +} + +/* _test_corrupt_event_crc(): corrupt CRC of specific event. +*/ +static c3_o +_test_corrupt_event_crc(const c3_c* dir_c, c3_d eve_d) +{ + c3_c path_c[8193]; + c3_i fid_i; + c3_w event_off, crc_off; + u3_book_deed_head deed_hed; + c3_w bad_crc = 0xDEADBEEF; + c3_zs ret_zs; + + // calculate offset to event + if ( c3n == _test_calculate_event_offset(dir_c, eve_d, &event_off) ) { + return c3n; + } + + _test_get_book_path(dir_c, path_c, sizeof(path_c)); + + fid_i = c3_open(path_c, O_RDWR, 0); + if ( 0 > fid_i ) { + fprintf(stderr, "book_tests: corrupt_event_crc open failed\r\n"); + return c3n; + } + + // read deed header to get len_d + ret_zs = pread(fid_i, &deed_hed, sizeof(u3_book_deed_head), event_off); + if ( sizeof(u3_book_deed_head) != ret_zs ) { + fprintf(stderr, "book_tests: corrupt_event_crc deed read failed\r\n"); + close(fid_i); + return c3n; + } + + // CRC offset = event_off + head(16 with padding) + (len_d - 4) + crc_off = event_off + 16 + (deed_hed.len_d - 4); + + ret_zs = pwrite(fid_i, &bad_crc, sizeof(c3_w), crc_off); + if ( sizeof(c3_w) != ret_zs ) { + fprintf(stderr, "book_tests: corrupt_event_crc write failed\r\n"); + close(fid_i); + return c3n; + } + + c3_sync(fid_i); + close(fid_i); + return c3y; +} + +/* _test_corrupt_event_length_tail(): corrupt let_d in event trailer. 
+*/ +static c3_o +_test_corrupt_event_length_tail(const c3_c* dir_c, c3_d eve_d, c3_d bad_let_d) +{ + c3_c path_c[8193]; + c3_i fid_i; + c3_w event_off, let_off; + u3_book_deed_head deed_hed; + c3_zs ret_zs; + + // calculate offset to event + if ( c3n == _test_calculate_event_offset(dir_c, eve_d, &event_off) ) { + return c3n; + } + + _test_get_book_path(dir_c, path_c, sizeof(path_c)); + + fid_i = c3_open(path_c, O_RDWR, 0); + if ( 0 > fid_i ) { + fprintf(stderr, "book_tests: corrupt_event_length open failed\r\n"); + return c3n; + } + + // read deed header to get len_d + ret_zs = pread(fid_i, &deed_hed, sizeof(u3_book_deed_head), event_off); + if ( sizeof(u3_book_deed_head) != ret_zs ) { + fprintf(stderr, "book_tests: corrupt_event_length deed read failed\r\n"); + close(fid_i); + return c3n; + } + + // let_d offset = event_off + head(16 with padding) + (len_d - 4) + crc_w(4) + let_off = event_off + 16 + (deed_hed.len_d - 4) + 4; + + ret_zs = pwrite(fid_i, &bad_let_d, sizeof(c3_d), let_off); + if ( sizeof(c3_d) != ret_zs ) { + fprintf(stderr, "book_tests: corrupt_event_length write failed\r\n"); + close(fid_i); + return c3n; + } + + c3_sync(fid_i); + close(fid_i); + return c3y; +} + +/* _test_truncate_file(): truncate book.log to specific offset. +*/ +static c3_o +_test_truncate_file(const c3_c* dir_c, c3_w offset) +{ + c3_c path_c[8193]; + + _test_get_book_path(dir_c, path_c, sizeof(path_c)); + + if ( 0 != truncate(path_c, offset) ) { + fprintf(stderr, "book_tests: truncate failed at offset %u\r\n", offset); + return c3n; + } + + return c3y; +} + /* read callback context */ typedef struct _read_ctx { @@ -803,50 +1059,1631 @@ _test_book_metadata(void) return c3y; } -/* _test_book_core(): run all core book tests. +/* failure mode tests +*/ + +/* _test_book_corrupt_header_magic(): test invalid magic number detection. 
*/ static c3_o -_test_book_core(void) +_test_book_corrupt_header_magic(void) { - c3_o ret = c3y; + c3_c* tmp_c = _test_tmpdir("book-corrupt-magic"); + u3_book* log_u; + void* bufs[10]; + c3_z sizes[10]; + c3_d i; - if ( c3n == _test_book_init_empty() ) { - fprintf(stderr, "book_tests: init_empty failed\r\n"); - ret = c3n; + if ( !tmp_c ) { + return c3n; } - if ( c3n == _test_book_single_event() ) { - fprintf(stderr, "book_tests: single_event failed\r\n"); - ret = c3n; + // create log with 10 events + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; } - if ( c3n == _test_book_batch_write() ) { - fprintf(stderr, "book_tests: batch_write failed\r\n"); - ret = c3n; + for ( i = 0; i < 10; i++ ) { + _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); } - if ( c3n == _test_book_persistence() ) { - fprintf(stderr, "book_tests: persistence failed\r\n"); - ret = c3n; + if ( c3n == u3_book_save(log_u, 1, 10, bufs, sizes, 0) ) { + fprintf(stderr, "book_tests: corrupt_header_magic save failed\r\n"); + for ( i = 0; i < 10; i++ ) { + c3_free(bufs[i]); + } + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; } - if ( c3n == _test_book_contiguity() ) { - fprintf(stderr, "book_tests: contiguity failed\r\n"); - ret = c3n; + for ( i = 0; i < 10; i++ ) { + c3_free(bufs[i]); } - if ( c3n == _test_book_partial_read() ) { - fprintf(stderr, "book_tests: partial_read failed\r\n"); - ret = c3n; + u3_book_exit(log_u); + + // corrupt magic number + if ( c3n == _test_corrupt_magic(tmp_c, 0xDEADBEEF) ) { + fprintf(stderr, "book_tests: corrupt_header_magic corruption failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; } - if ( c3n == _test_book_iterator() ) { - fprintf(stderr, "book_tests: iterator failed\r\n"); - ret = c3n; + // try to reopen - should fail + log_u = u3_book_init(tmp_c); + if ( log_u ) { + fprintf(stderr, "book_tests: corrupt_header_magic should have failed\r\n"); + 
u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; } - if ( c3n == _test_book_metadata() ) { - fprintf(stderr, "book_tests: metadata failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* _test_book_corrupt_header_version(): test unsupported version detection. +*/ +static c3_o +_test_book_corrupt_header_version(void) +{ + c3_c* tmp_c = _test_tmpdir("book-corrupt-version"); + u3_book* log_u; + void* bufs[10]; + c3_z sizes[10]; + c3_d i; + + if ( !tmp_c ) { + return c3n; + } + + // create log with 10 events + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 10; i++ ) { + _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); + } + + if ( c3n == u3_book_save(log_u, 1, 10, bufs, sizes, 0) ) { + fprintf(stderr, "book_tests: corrupt_header_version save failed\r\n"); + for ( i = 0; i < 10; i++ ) { + c3_free(bufs[i]); + } + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 10; i++ ) { + c3_free(bufs[i]); + } + + u3_book_exit(log_u); + + // corrupt version + if ( c3n == _test_corrupt_version(tmp_c, 99) ) { + fprintf(stderr, "book_tests: corrupt_header_version corruption failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // try to reopen - should fail + log_u = u3_book_init(tmp_c); + if ( log_u ) { + fprintf(stderr, "book_tests: corrupt_header_version should have failed\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* _test_book_corrupt_deed_crc(): test CRC corruption detection and recovery. 
+*/ +static c3_o +_test_book_corrupt_deed_crc(void) +{ + c3_c* tmp_c = _test_tmpdir("book-corrupt-crc"); + u3_book* log_u; + void* bufs[50]; + c3_z sizes[50]; + c3_d i, low_d, hig_d; + read_ctx ctx = {0}; + + if ( !tmp_c ) { + return c3n; + } + + // create log with 50 events + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 50; i++ ) { + _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); + } + + if ( c3n == u3_book_save(log_u, 1, 50, bufs, sizes, 0) ) { + fprintf(stderr, "book_tests: corrupt_deed_crc save failed\r\n"); + for ( i = 0; i < 50; i++ ) { + c3_free(bufs[i]); + } + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 50; i++ ) { + c3_free(bufs[i]); + } + + u3_book_exit(log_u); + + // corrupt event 25's CRC + if ( c3n == _test_corrupt_event_crc(tmp_c, 25) ) { + fprintf(stderr, "book_tests: corrupt_deed_crc corruption failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // reopen - should succeed with recovery + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + fprintf(stderr, "book_tests: corrupt_deed_crc reopen failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // verify recovery truncated to event 24 + u3_book_gulf(log_u, &low_d, &hig_d); + if ( 1 != low_d || 24 != hig_d ) { + fprintf(stderr, "book_tests: corrupt_deed_crc gulf wrong: [%llu, %llu] expected [1, 24]\r\n", + low_d, hig_d); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // read events 1-24 should succeed + ctx.expected_start = 1; + ctx.count = 0; + ctx.failed = c3n; + if ( c3n == u3_book_read(log_u, &ctx, 1, 24, _test_read_cb) ) { + fprintf(stderr, "book_tests: corrupt_deed_crc read failed\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + if ( c3y == ctx.failed || 24 != ctx.count ) { + fprintf(stderr, "book_tests: corrupt_deed_crc 
read verify failed\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* _test_book_corrupt_deed_length_mismatch(): test len_d != let_d detection. +*/ +static c3_o +_test_book_corrupt_deed_length_mismatch(void) +{ + c3_c* tmp_c = _test_tmpdir("book-corrupt-length"); + u3_book* log_u; + void* bufs[30]; + c3_z sizes[30]; + c3_d i, low_d, hig_d; + + if ( !tmp_c ) { + return c3n; + } + + // create log with 30 events + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 30; i++ ) { + _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); + } + + if ( c3n == u3_book_save(log_u, 1, 30, bufs, sizes, 0) ) { + fprintf(stderr, "book_tests: corrupt_deed_length save failed\r\n"); + for ( i = 0; i < 30; i++ ) { + c3_free(bufs[i]); + } + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 30; i++ ) { + c3_free(bufs[i]); + } + + u3_book_exit(log_u); + + // corrupt event 15's let_d field + if ( c3n == _test_corrupt_event_length_tail(tmp_c, 15, 99999) ) { + fprintf(stderr, "book_tests: corrupt_deed_length corruption failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // reopen with recovery + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + fprintf(stderr, "book_tests: corrupt_deed_length reopen failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // verify recovery truncated to event 14 + u3_book_gulf(log_u, &low_d, &hig_d); + if ( 1 != low_d || 14 != hig_d ) { + fprintf(stderr, "book_tests: corrupt_deed_length gulf wrong: [%llu, %llu] expected [1, 14]\r\n", + low_d, hig_d); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* _test_book_truncated_deed_partial(): test 
partial deed detection. +*/ +static c3_o +_test_book_truncated_deed_partial(void) +{ + c3_c* tmp_c = _test_tmpdir("book-truncated"); + u3_book* log_u; + void* bufs[20]; + c3_z sizes[20]; + c3_d i, low_d, hig_d; + c3_w event20_off; + + if ( !tmp_c ) { + return c3n; + } + + // create log with 20 events + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 20; i++ ) { + _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); + } + + if ( c3n == u3_book_save(log_u, 1, 20, bufs, sizes, 0) ) { + fprintf(stderr, "book_tests: truncated_deed save failed\r\n"); + for ( i = 0; i < 20; i++ ) { + c3_free(bufs[i]); + } + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 20; i++ ) { + c3_free(bufs[i]); + } + + u3_book_exit(log_u); + + // calculate offset to event 20 + if ( c3n == _test_calculate_event_offset(tmp_c, 20, &event20_off) ) { + fprintf(stderr, "book_tests: truncated_deed offset calc failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // truncate in middle of event 20 + if ( c3n == _test_truncate_file(tmp_c, event20_off + 10) ) { + fprintf(stderr, "book_tests: truncated_deed truncate failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // reopen + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + fprintf(stderr, "book_tests: truncated_deed reopen failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // verify recovery removed partial event 20 + u3_book_gulf(log_u, &low_d, &hig_d); + if ( 1 != low_d || 19 != hig_d ) { + fprintf(stderr, "book_tests: truncated_deed gulf wrong: [%llu, %llu] expected [1, 19]\r\n", + low_d, hig_d); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* _test_book_multiple_corruptions(): verify recovery stops at first corruption. 
+*/ +static c3_o +_test_book_multiple_corruptions(void) +{ + c3_c* tmp_c = _test_tmpdir("book-multi-corrupt"); + u3_book* log_u; + void* bufs[100]; + c3_z sizes[100]; + c3_d i, low_d, hig_d; + + if ( !tmp_c ) { + return c3n; + } + + // create log with 100 events + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 100; i++ ) { + _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); + } + + if ( c3n == u3_book_save(log_u, 1, 100, bufs, sizes, 0) ) { + fprintf(stderr, "book_tests: multi_corrupt save failed\r\n"); + for ( i = 0; i < 100; i++ ) { + c3_free(bufs[i]); + } + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 100; i++ ) { + c3_free(bufs[i]); + } + + u3_book_exit(log_u); + + // corrupt event 30's CRC + if ( c3n == _test_corrupt_event_crc(tmp_c, 30) ) { + fprintf(stderr, "book_tests: multi_corrupt first corruption failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // corrupt event 60's CRC + if ( c3n == _test_corrupt_event_crc(tmp_c, 60) ) { + fprintf(stderr, "book_tests: multi_corrupt second corruption failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // reopen + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + fprintf(stderr, "book_tests: multi_corrupt reopen failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // verify recovery stopped at first corruption (event 30) + u3_book_gulf(log_u, &low_d, &hig_d); + if ( 1 != low_d || 29 != hig_d ) { + fprintf(stderr, "book_tests: multi_corrupt gulf wrong: [%llu, %llu] expected [1, 29]\r\n", + low_d, hig_d); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* _test_book_corrupt_first_event(): corruption at first event empties log. 
+*/ +static c3_o +_test_book_corrupt_first_event(void) +{ + c3_c* tmp_c = _test_tmpdir("book-corrupt-first"); + u3_book* log_u; + void* bufs[50]; + c3_z sizes[50]; + c3_d i, low_d, hig_d; + + if ( !tmp_c ) { + return c3n; + } + + // create log with 50 events + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 50; i++ ) { + _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); + } + + if ( c3n == u3_book_save(log_u, 1, 50, bufs, sizes, 0) ) { + fprintf(stderr, "book_tests: corrupt_first save failed\r\n"); + for ( i = 0; i < 50; i++ ) { + c3_free(bufs[i]); + } + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 50; i++ ) { + c3_free(bufs[i]); + } + + u3_book_exit(log_u); + + // corrupt event 1's CRC + if ( c3n == _test_corrupt_event_crc(tmp_c, 1) ) { + fprintf(stderr, "book_tests: corrupt_first corruption failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // reopen + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + fprintf(stderr, "book_tests: corrupt_first reopen failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // verify log is empty + u3_book_gulf(log_u, &low_d, &hig_d); + if ( 0 != low_d || 0 != hig_d ) { + fprintf(stderr, "book_tests: corrupt_first gulf wrong: [%llu, %llu] expected [0, 0]\r\n", + low_d, hig_d); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* _test_book_file_too_small(): detect undersized file. 
+*/ +static c3_o +_test_book_file_too_small(void) +{ + c3_c* tmp_c = _test_tmpdir("book-too-small"); + c3_c path_c[8193]; + c3_i fid_i; + c3_y small_buf[32]; + u3_book* log_u; + + if ( !tmp_c ) { + return c3n; + } + + // manually create small file + _test_get_book_path(tmp_c, path_c, sizeof(path_c)); + + fid_i = c3_open(path_c, O_RDWR|O_CREAT, 0644); + if ( 0 > fid_i ) { + fprintf(stderr, "book_tests: file_too_small create failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + memset(small_buf, 0, sizeof(small_buf)); + if ( sizeof(small_buf) != write(fid_i, small_buf, sizeof(small_buf)) ) { + fprintf(stderr, "book_tests: file_too_small write failed\r\n"); + close(fid_i); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + close(fid_i); + + // try to init - should fail + log_u = u3_book_init(tmp_c); + if ( log_u ) { + fprintf(stderr, "book_tests: file_too_small should have failed\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* boundary condition tests +*/ + +/* _test_book_read_empty_log(): test reading from empty log. +*/ +static c3_o +_test_book_read_empty_log(void) +{ + c3_c* tmp_c = _test_tmpdir("book-read-empty"); + u3_book* log_u; + read_ctx ctx = {0}; + + if ( !tmp_c ) { + return c3n; + } + + // create empty log + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // try to read from empty log - should fail + ctx.expected_start = 1; + ctx.count = 0; + ctx.failed = c3n; + if ( c3y == u3_book_read(log_u, &ctx, 1, 1, _test_read_cb) ) { + fprintf(stderr, "book_tests: read_empty_log should have failed\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* _test_book_read_beyond_range(): test reading beyond event range. 
+*/ +static c3_o +_test_book_read_beyond_range(void) +{ + c3_c* tmp_c = _test_tmpdir("book-read-beyond"); + u3_book* log_u; + void* bufs[10]; + c3_z sizes[10]; + c3_d i; + read_ctx ctx = {0}; + + if ( !tmp_c ) { + return c3n; + } + + // create log with events 1-10 + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 10; i++ ) { + _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); + } + + if ( c3n == u3_book_save(log_u, 1, 10, bufs, sizes, 0) ) { + fprintf(stderr, "book_tests: read_beyond_range save failed\r\n"); + for ( i = 0; i < 10; i++ ) { + c3_free(bufs[i]); + } + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 10; i++ ) { + c3_free(bufs[i]); + } + + // try to read event 11 - should fail + if ( c3y == u3_book_read(log_u, &ctx, 11, 1, _test_read_cb) ) { + fprintf(stderr, "book_tests: read_beyond_range event 11 should have failed\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // try to read events 5-15 - should fail (extends beyond) + if ( c3y == u3_book_read(log_u, &ctx, 5, 11, _test_read_cb) ) { + fprintf(stderr, "book_tests: read_beyond_range events 5-15 should have failed\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // try to read event 0 - should fail (before first) + if ( c3y == u3_book_read(log_u, &ctx, 0, 1, _test_read_cb) ) { + fprintf(stderr, "book_tests: read_beyond_range event 0 should have failed\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* _test_book_iterator_invalid_ranges(): test iterator with invalid ranges. 
+*/ +static c3_o +_test_book_iterator_invalid_ranges(void) +{ + c3_c* tmp_c = _test_tmpdir("book-iter-invalid"); + u3_book* log_u; + u3_book_walk itr_u; + void* bufs[50]; + c3_z sizes[50]; + c3_d i; + + if ( !tmp_c ) { + return c3n; + } + + // create log with events 1-50 + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 50; i++ ) { + _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); + } + + if ( c3n == u3_book_save(log_u, 1, 50, bufs, sizes, 0) ) { + fprintf(stderr, "book_tests: iter_invalid_ranges save failed\r\n"); + for ( i = 0; i < 50; i++ ) { + c3_free(bufs[i]); + } + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 50; i++ ) { + c3_free(bufs[i]); + } + + // try iterator [60, 70] - should fail (beyond range) + if ( c3y == u3_book_walk_init(log_u, &itr_u, 60, 70) ) { + fprintf(stderr, "book_tests: iter_invalid_ranges [60, 70] should have failed\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // try iterator [40, 30] - should fail (start > end) + if ( c3y == u3_book_walk_init(log_u, &itr_u, 40, 30) ) { + fprintf(stderr, "book_tests: iter_invalid_ranges [40, 30] should have failed\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // try iterator [0, 10] - should fail (before first) + if ( c3y == u3_book_walk_init(log_u, &itr_u, 0, 10) ) { + fprintf(stderr, "book_tests: iter_invalid_ranges [0, 10] should have failed\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* _test_book_write_first_wrong_epoch(): test first event must be epo_d + 1. 
+*/ +static c3_o +_test_book_write_first_wrong_epoch(void) +{ + c3_c* tmp_c = _test_tmpdir("book-wrong-epoch"); + u3_book* log_u; + c3_y* buf_y; + c3_z siz_z; + + if ( !tmp_c ) { + return c3n; + } + + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // try to save event 5 with epo_d=0 - should fail (expected event 1) + _test_make_event(&buf_y, &siz_z, 5); + if ( c3y == u3_book_save(log_u, 5, 1, (void**)&buf_y, &siz_z, 0) ) { + fprintf(stderr, "book_tests: wrong_epoch event 5 with epo 0 should have failed\r\n"); + c3_free(buf_y); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + c3_free(buf_y); + + // try to save event 1 with epo_d=5 - should fail (expected event 6) + _test_make_event(&buf_y, &siz_z, 1); + if ( c3y == u3_book_save(log_u, 1, 1, (void**)&buf_y, &siz_z, 5) ) { + fprintf(stderr, "book_tests: wrong_epoch event 1 with epo 5 should have failed\r\n"); + c3_free(buf_y); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + c3_free(buf_y); + + // save event 1 with epo_d=0 - should succeed + _test_make_event(&buf_y, &siz_z, 1); + if ( c3n == u3_book_save(log_u, 1, 1, (void**)&buf_y, &siz_z, 0) ) { + fprintf(stderr, "book_tests: wrong_epoch event 1 with epo 0 failed\r\n"); + c3_free(buf_y); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + c3_free(buf_y); + + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* _test_large_event_cb(): callback for large event test. 
+*/ +static c3_o +_test_large_event_cb(void* ptr_v, c3_d eve_d, c3_z siz_z, void* buf_v) +{ + c3_z* expected_size = (c3_z*)ptr_v; + + if ( 1 != eve_d ) { + fprintf(stderr, "book_tests: large_event_cb wrong event: %llu\r\n", eve_d); + return c3n; + } + + if ( *expected_size != siz_z ) { + fprintf(stderr, "book_tests: large_event_cb size mismatch: %zu vs %zu\r\n", + siz_z, *expected_size); + return c3n; + } + + return c3y; +} + +/* _test_book_very_large_event(): test large event handling. +*/ +static c3_o +_test_book_very_large_event(void) +{ + c3_c* tmp_c = _test_tmpdir("book-large-event"); + u3_book* log_u; + c3_y* buf_y; + c3_z siz_z; + c3_z large_size = 1024 * 1024; // 1 MB event + c3_w mug_w = 12345; + c3_z i; + + if ( !tmp_c ) { + return c3n; + } + + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // create large event: 4-byte mug + (large_size - 4) jam data + siz_z = large_size; + buf_y = c3_malloc(siz_z); + + memcpy(buf_y, &mug_w, 4); + for ( i = 4; i < siz_z; i++ ) { + buf_y[i] = (c3_y)(i & 0xff); + } + + // save large event + if ( c3n == u3_book_save(log_u, 1, 1, (void**)&buf_y, &siz_z, 0) ) { + fprintf(stderr, "book_tests: very_large_event save failed\r\n"); + c3_free(buf_y); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + c3_free(buf_y); + + // read back and verify size matches + if ( c3n == u3_book_read(log_u, &large_size, 1, 1, _test_large_event_cb) ) { + fprintf(stderr, "book_tests: very_large_event read failed\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* metadata edge case tests +*/ + +/* _test_book_metadata_section_full(): test 256-byte metadata limit. 
+*/ +static c3_o +_test_book_metadata_section_full(void) +{ + c3_c* tmp_c = _test_tmpdir("book-meta-full"); + u3_book* log_u; + c3_y data[64]; + c3_i count; + + if ( !tmp_c ) { + return c3n; + } + + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // fill metadata section with entries + // format: [4-byte count][4-byte key_len][key][4-byte val_len][val]... + // total limit: 256 bytes + memset(data, 0xAB, sizeof(data)); + + // add entries until close to limit + for ( count = 0; count < 20; count++ ) { + c3_c key_c[16]; + snprintf(key_c, sizeof(key_c), "key%d", count); + + if ( c3n == u3_book_save_meta(log_u, key_c, sizeof(data), data) ) { + // expected to fail when metadata is full + break; + } + } + + if ( 0 == count ) { + fprintf(stderr, "book_tests: meta_section_full no entries saved\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // try to add one more - should fail if we hit the limit + if ( c3y == u3_book_save_meta(log_u, "overflow", sizeof(data), data) ) { + // if it succeeded, we didn't hit the limit yet - that's ok + } + + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* _test_book_metadata_corrupted_count(): test corrupted metadata handling. 
+*/ +static c3_o +_test_book_metadata_corrupted_count(void) +{ + c3_c* tmp_c = _test_tmpdir("book-meta-corrupt"); + u3_book* log_u; + c3_w version = 1; + void* bufs[10]; + c3_z sizes[10]; + c3_d i, low_d, hig_d; + c3_c path_c[8193]; + c3_i fid_i; + c3_w bad_count = 999; + + if ( !tmp_c ) { + return c3n; + } + + // create log with metadata and events + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // add metadata + if ( c3n == u3_book_save_meta(log_u, "version", sizeof(version), &version) ) { + fprintf(stderr, "book_tests: meta_corrupted save_meta failed\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // add events + for ( i = 0; i < 10; i++ ) { + _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); + } + + if ( c3n == u3_book_save(log_u, 1, 10, bufs, sizes, 0) ) { + fprintf(stderr, "book_tests: meta_corrupted save failed\r\n"); + for ( i = 0; i < 10; i++ ) { + c3_free(bufs[i]); + } + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 10; i++ ) { + c3_free(bufs[i]); + } + + u3_book_exit(log_u); + + // corrupt metadata count field (at offset 64) + _test_get_book_path(tmp_c, path_c, sizeof(path_c)); + fid_i = c3_open(path_c, O_RDWR, 0); + if ( 0 > fid_i ) { + fprintf(stderr, "book_tests: meta_corrupted open failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + pwrite(fid_i, &bad_count, sizeof(c3_w), 64); + c3_sync(fid_i); + close(fid_i); + + // reopen - should succeed (metadata corruption shouldn't prevent init) + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + fprintf(stderr, "book_tests: meta_corrupted reopen failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // events should still be readable + u3_book_gulf(log_u, &low_d, &hig_d); + if ( 1 != low_d || 10 != hig_d ) { + fprintf(stderr, "book_tests: meta_corrupted events lost\r\n"); + u3_book_exit(log_u); 
+ _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* _test_book_metadata_empty_key(): test empty key edge case. +*/ +static c3_o +_test_book_metadata_empty_key(void) +{ + c3_c* tmp_c = _test_tmpdir("book-meta-empty"); + u3_book* log_u; + c3_w val = 42; + meta_ctx ctx = {0}; + + if ( !tmp_c ) { + return c3n; + } + + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // try to save with empty key + if ( c3n == u3_book_save_meta(log_u, "", sizeof(val), &val) ) { + // empty key rejected - acceptable behavior + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; + } + + // empty key accepted - try to read it back + u3_book_read_meta(log_u, &ctx, "", _test_meta_cb); + if ( c3n == ctx.found ) { + fprintf(stderr, "book_tests: meta_empty_key not found after save\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* _test_book_metadata_persistence(): test metadata survives corruption recovery. 
+*/ +static c3_o +_test_book_metadata_persistence(void) +{ + c3_c* tmp_c = _test_tmpdir("book-meta-persist"); + u3_book* log_u; + c3_w version = 1; + void* bufs[20]; + c3_z sizes[20]; + c3_d i, low_d, hig_d; + meta_ctx ctx = {0}; + + if ( !tmp_c ) { + return c3n; + } + + // create log with metadata + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + if ( c3n == u3_book_save_meta(log_u, "version", sizeof(version), &version) ) { + fprintf(stderr, "book_tests: meta_persistence save_meta failed\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // add events + for ( i = 0; i < 20; i++ ) { + _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); + } + + if ( c3n == u3_book_save(log_u, 1, 20, bufs, sizes, 0) ) { + fprintf(stderr, "book_tests: meta_persistence save failed\r\n"); + for ( i = 0; i < 20; i++ ) { + c3_free(bufs[i]); + } + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 20; i++ ) { + c3_free(bufs[i]); + } + + u3_book_exit(log_u); + + // corrupt last event + if ( c3n == _test_corrupt_event_crc(tmp_c, 20) ) { + fprintf(stderr, "book_tests: meta_persistence corruption failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // reopen with recovery + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + fprintf(stderr, "book_tests: meta_persistence reopen failed\r\n"); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // verify recovery happened + u3_book_gulf(log_u, &low_d, &hig_d); + if ( 19 != hig_d ) { + fprintf(stderr, "book_tests: meta_persistence recovery didn't happen\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // verify metadata still readable + u3_book_read_meta(log_u, &ctx, "version", _test_meta_cb); + if ( c3n == ctx.found ) { + fprintf(stderr, "book_tests: meta_persistence metadata lost\r\n"); + u3_book_exit(log_u); + 
_test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* invalid operation tests +*/ + +/* _test_book_null_handle(): test NULL handle checks. +*/ +static c3_o +_test_book_null_handle(void) +{ + c3_d low_d, hig_d; + read_ctx ctx = {0}; + c3_y* buf_y; + c3_z siz_z; + + // test gulf with NULL + if ( c3y == u3_book_gulf(NULL, &low_d, &hig_d) ) { + fprintf(stderr, "book_tests: null_handle gulf should have failed\r\n"); + return c3n; + } + + // test read with NULL + if ( c3y == u3_book_read(NULL, &ctx, 1, 1, _test_read_cb) ) { + fprintf(stderr, "book_tests: null_handle read should have failed\r\n"); + return c3n; + } + + // test save with NULL + _test_make_event(&buf_y, &siz_z, 1); + if ( c3y == u3_book_save(NULL, 1, 1, (void**)&buf_y, &siz_z, 0) ) { + fprintf(stderr, "book_tests: null_handle save should have failed\r\n"); + c3_free(buf_y); + return c3n; + } + c3_free(buf_y); + + return c3y; +} + +/* _test_book_iterator_after_done(): test closed iterator. 
+*/ +static c3_o +_test_book_iterator_after_done(void) +{ + c3_c* tmp_c = _test_tmpdir("book-iter-done"); + u3_book* log_u; + u3_book_walk itr_u; + void* bufs[20]; + c3_z sizes[20]; + c3_d i; + c3_z len_z; + void* buf_v; + + if ( !tmp_c ) { + return c3n; + } + + // create log with events 1-20 + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 20; i++ ) { + _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); + } + + if ( c3n == u3_book_save(log_u, 1, 20, bufs, sizes, 0) ) { + fprintf(stderr, "book_tests: iter_after_done save failed\r\n"); + for ( i = 0; i < 20; i++ ) { + c3_free(bufs[i]); + } + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 20; i++ ) { + c3_free(bufs[i]); + } + + // create iterator + if ( c3n == u3_book_walk_init(log_u, &itr_u, 1, 20) ) { + fprintf(stderr, "book_tests: iter_after_done walk_init failed\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // close iterator + u3_book_walk_done(&itr_u); + + // try to use closed iterator - should fail + if ( c3y == u3_book_walk_next(&itr_u, &len_z, &buf_v) ) { + fprintf(stderr, "book_tests: iter_after_done walk_next should have failed\r\n"); + c3_free(buf_v); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* _test_book_iterator_concurrent_modification(): test iterator after log modification. 
+*/ +static c3_o +_test_book_iterator_concurrent_modification(void) +{ + c3_c* tmp_c = _test_tmpdir("book-iter-concurrent"); + u3_book* log_u; + u3_book_walk itr_u; + void* bufs[70]; + c3_z sizes[70]; + c3_d i; + c3_z len_z; + void* buf_v; + c3_d count = 0; + + if ( !tmp_c ) { + return c3n; + } + + // create log with events 1-50 + log_u = u3_book_init(tmp_c); + if ( !log_u ) { + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 50; i++ ) { + _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); + } + + if ( c3n == u3_book_save(log_u, 1, 50, bufs, sizes, 0) ) { + fprintf(stderr, "book_tests: iter_concurrent save failed\r\n"); + for ( i = 0; i < 50; i++ ) { + c3_free(bufs[i]); + } + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 0; i < 50; i++ ) { + c3_free(bufs[i]); + } + + // create iterator for events 10-30 + if ( c3n == u3_book_walk_init(log_u, &itr_u, 10, 30) ) { + fprintf(stderr, "book_tests: iter_concurrent walk_init failed\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + // read a few events + for ( count = 0; count < 5; count++ ) { + if ( c3n == u3_book_walk_next(&itr_u, &len_z, &buf_v) ) { + fprintf(stderr, "book_tests: iter_concurrent walk_next failed\r\n"); + u3_book_walk_done(&itr_u); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + c3_free(buf_v); + } + + // add new events 51-60 + for ( i = 50; i < 60; i++ ) { + _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); + } + + if ( c3n == u3_book_save(log_u, 51, 10, &bufs[50], &sizes[50], 0) ) { + fprintf(stderr, "book_tests: iter_concurrent second save failed\r\n"); + for ( i = 50; i < 60; i++ ) { + c3_free(bufs[i]); + } + u3_book_walk_done(&itr_u); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + for ( i = 50; i < 60; i++ ) { + c3_free(bufs[i]); + } + + // continue iterating - should continue with original range 
+ while ( c3y == u3_book_walk_next(&itr_u, &len_z, &buf_v) ) { + c3_free(buf_v); + count++; + } + + // verify we read the expected range (10-30 = 21 events, already read 5) + if ( 21 != count ) { + fprintf(stderr, "book_tests: iter_concurrent count wrong: %llu\r\n", count); + u3_book_walk_done(&itr_u); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; + } + + u3_book_walk_done(&itr_u); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3y; +} + +/* _test_book_core(): run all core book tests. +*/ +static c3_o +_test_book_core(void) +{ + c3_o ret = c3y; + + if ( c3n == _test_book_init_empty() ) { + fprintf(stderr, "book_tests: init_empty failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_single_event() ) { + fprintf(stderr, "book_tests: single_event failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_batch_write() ) { + fprintf(stderr, "book_tests: batch_write failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_persistence() ) { + fprintf(stderr, "book_tests: persistence failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_contiguity() ) { + fprintf(stderr, "book_tests: contiguity failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_partial_read() ) { + fprintf(stderr, "book_tests: partial_read failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_iterator() ) { + fprintf(stderr, "book_tests: iterator failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_metadata() ) { + fprintf(stderr, "book_tests: metadata failed\r\n"); + ret = c3n; + } + + // file corruption tests + if ( c3n == _test_book_corrupt_header_magic() ) { + fprintf(stderr, "book_tests: corrupt_header_magic failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_corrupt_header_version() ) { + fprintf(stderr, "book_tests: corrupt_header_version failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_corrupt_deed_crc() ) { + fprintf(stderr, "book_tests: corrupt_deed_crc failed\r\n"); + ret = c3n; + } + + if ( c3n 
== _test_book_corrupt_deed_length_mismatch() ) { + fprintf(stderr, "book_tests: corrupt_deed_length_mismatch failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_truncated_deed_partial() ) { + fprintf(stderr, "book_tests: truncated_deed_partial failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_multiple_corruptions() ) { + fprintf(stderr, "book_tests: multiple_corruptions failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_corrupt_first_event() ) { + fprintf(stderr, "book_tests: corrupt_first_event failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_file_too_small() ) { + fprintf(stderr, "book_tests: file_too_small failed\r\n"); + ret = c3n; + } + + // boundary condition tests + if ( c3n == _test_book_read_empty_log() ) { + fprintf(stderr, "book_tests: read_empty_log failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_read_beyond_range() ) { + fprintf(stderr, "book_tests: read_beyond_range failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_iterator_invalid_ranges() ) { + fprintf(stderr, "book_tests: iterator_invalid_ranges failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_write_first_wrong_epoch() ) { + fprintf(stderr, "book_tests: write_first_wrong_epoch failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_very_large_event() ) { + fprintf(stderr, "book_tests: very_large_event failed\r\n"); + ret = c3n; + } + + // metadata edge case tests + if ( c3n == _test_book_metadata_section_full() ) { + fprintf(stderr, "book_tests: metadata_section_full failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_metadata_corrupted_count() ) { + fprintf(stderr, "book_tests: metadata_corrupted_count failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_metadata_empty_key() ) { + fprintf(stderr, "book_tests: metadata_empty_key failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_metadata_persistence() ) { + fprintf(stderr, "book_tests: metadata_persistence failed\r\n"); + ret = c3n; + } + + // invalid operation tests + if 
( c3n == _test_book_null_handle() ) { + fprintf(stderr, "book_tests: null_handle failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_iterator_after_done() ) { + fprintf(stderr, "book_tests: iterator_after_done failed\r\n"); + ret = c3n; + } + + if ( c3n == _test_book_iterator_concurrent_modification() ) { + fprintf(stderr, "book_tests: iterator_concurrent_modification failed\r\n"); ret = c3n; } From 9dc21aef551ca55e37059dfdc9cca26caaf74fd8 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Fri, 2 Jan 2026 11:46:19 -0500 Subject: [PATCH 06/38] book: simplifies metadata api --- pkg/vere/book_tests.c | 57 ++--- pkg/vere/db/book.c | 486 ++++++++++++++---------------------------- pkg/vere/db/book.h | 25 ++- pkg/vere/disk.c | 10 +- 4 files changed, 203 insertions(+), 375 deletions(-) diff --git a/pkg/vere/book_tests.c b/pkg/vere/book_tests.c index 63be19dd5d..8792389a46 100644 --- a/pkg/vere/book_tests.c +++ b/pkg/vere/book_tests.c @@ -2024,8 +2024,7 @@ _test_book_metadata_section_full(void) { c3_c* tmp_c = _test_tmpdir("book-meta-full"); u3_book* log_u; - c3_y data[64]; - c3_i count; + c3_y data[4]; if ( !tmp_c ) { return c3n; @@ -2038,33 +2037,25 @@ _test_book_metadata_section_full(void) return c3n; } - // fill metadata section with entries - // format: [4-byte count][4-byte key_len][key][4-byte val_len][val]... 
- // total limit: 256 bytes + // try to save the four fixed keys we support memset(data, 0xAB, sizeof(data)); - - // add entries until close to limit - for ( count = 0; count < 20; count++ ) { - c3_c key_c[16]; - snprintf(key_c, sizeof(key_c), "key%d", count); - - if ( c3n == u3_book_save_meta(log_u, key_c, sizeof(data), data) ) { - // expected to fail when metadata is full - break; - } - } - - if ( 0 == count ) { - fprintf(stderr, "book_tests: meta_section_full no entries saved\r\n"); + + // version (4 bytes) + if ( c3n == u3_book_save_meta(log_u, "version", 4, data) ) { + fprintf(stderr, "book_tests: meta_section_full version save failed\r\n"); u3_book_exit(log_u); _test_cleanup(tmp_c); c3_free(tmp_c); return c3n; } - // try to add one more - should fail if we hit the limit - if ( c3y == u3_book_save_meta(log_u, "overflow", sizeof(data), data) ) { - // if it succeeded, we didn't hit the limit yet - that's ok + // unknown key should fail + if ( c3y == u3_book_save_meta(log_u, "unknown", 4, data) ) { + fprintf(stderr, "book_tests: meta_section_full unknown key should have failed\r\n"); + u3_book_exit(log_u); + _test_cleanup(tmp_c); + c3_free(tmp_c); + return c3n; } u3_book_exit(log_u); @@ -2170,7 +2161,7 @@ _test_book_metadata_corrupted_count(void) return c3y; } -/* _test_book_metadata_empty_key(): test empty key edge case. +/* _test_book_metadata_empty_key(): test unknown key edge case. 
*/ static c3_o _test_book_metadata_empty_key(void) @@ -2178,7 +2169,6 @@ _test_book_metadata_empty_key(void) c3_c* tmp_c = _test_tmpdir("book-meta-empty"); u3_book* log_u; c3_w val = 42; - meta_ctx ctx = {0}; if ( !tmp_c ) { return c3n; @@ -2191,31 +2181,22 @@ _test_book_metadata_empty_key(void) return c3n; } - // try to save with empty key - if ( c3n == u3_book_save_meta(log_u, "", sizeof(val), &val) ) { - // empty key rejected - acceptable behavior + // try to save with unknown key - should fail + if ( c3n == u3_book_save_meta(log_u, "unknown_key", sizeof(val), &val) ) { + // unknown key rejected - expected behavior u3_book_exit(log_u); _test_cleanup(tmp_c); c3_free(tmp_c); return c3y; } - // empty key accepted - try to read it back - u3_book_read_meta(log_u, &ctx, "", _test_meta_cb); - if ( c3n == ctx.found ) { - fprintf(stderr, "book_tests: meta_empty_key not found after save\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - + // unknown key accepted - that's ok, just verify it doesn't crash u3_book_exit(log_u); _test_cleanup(tmp_c); c3_free(tmp_c); return c3y; } - + /* _test_book_metadata_persistence(): test metadata survives corruption recovery. */ static c3_o diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index 71d9c99957..caf4a6eb79 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -20,7 +20,7 @@ // // file format: // [64-byte header] -// [metadata section] +// [256-byte metadata section] // [events: len_d | mug_l | jam_data | crc_m | let_d] // @@ -113,11 +113,42 @@ _book_init_head(u3_book* txt_u) txt_u->hed_u.ver_w = BOOK_VERSION; txt_u->hed_u.fir_d = 0; txt_u->hed_u.las_d = 0; - txt_u->hed_u.off_w = 0; - txt_u->hed_u.len_w = 0; + txt_u->hed_u.off_w = sizeof(u3_book_head); + txt_u->hed_u.len_w = BOOK_META_SIZE; txt_u->dit_o = c3y; } +/* _book_init_meta(): initialize metadata section with zeros. 
+*/ +static c3_o +_book_init_meta(u3_book* txt_u) +{ + u3_book_meta met_u; + c3_zs ret_zs; + + // zero-initialize metadata + memset(&met_u, 0, sizeof(u3_book_meta)); + + // write metadata section at fixed offset + ret_zs = pwrite(txt_u->fid_i, &met_u, sizeof(u3_book_meta), + sizeof(u3_book_head)); + + if ( ret_zs != sizeof(u3_book_meta) ) { + fprintf(stderr, "book: init_meta: failed to write metadata: %s\r\n", + strerror(errno)); + return c3n; + } + + // sync metadata to disk + if ( -1 == c3_sync(txt_u->fid_i) ) { + fprintf(stderr, "book: init_meta: failed to sync metadata: %s\r\n", + strerror(errno)); + return c3n; + } + + return c3y; +} + /* _book_deed_size(): calculate total on-disk size of deed. */ static inline c3_w @@ -348,7 +379,7 @@ _book_scan_end(u3_book* txt_u) if ( cot_d == 0 ) { txt_u->hed_u.fir_d = 0; txt_u->hed_u.las_d = 0; - off_w = sizeof(u3_book_head); + off_w = sizeof(u3_book_head) + BOOK_META_SIZE; } else { txt_u->hed_u.las_d = txt_u->hed_u.fir_d + cot_d - 1; } @@ -406,6 +437,13 @@ u3_book_init(const c3_c* pax_c) // new file: initialize header _book_init_head(txt_u); _book_save_head(txt_u); + // initialize metadata section + if ( c3n == _book_init_meta(txt_u) ) { + close(fid_i); + c3_free(txt_u->pax_c); + c3_free(txt_u); + return 0; + } // events start after header + reserved metadata area txt_u->off_w = sizeof(u3_book_head) + BOOK_META_SIZE; } @@ -521,8 +559,6 @@ u3_book_stat(const c3_c* pax_c) (0 == hed_u.las_d ) ? 0 : (hed_u.las_d - hed_u.fir_d + 1)); fprintf(stderr, " file size: %lld bytes\r\n", (long long)buf_u.st_size); - fprintf(stderr, " metadata offset: %u\r\n", hed_u.off_w); - fprintf(stderr, " metadata length: %u\r\n", hed_u.len_w); close(fid_i); } @@ -704,6 +740,124 @@ u3_book_read(u3_book* txt_u, return c3y; } +/* u3_book_read_meta(): read fixed metadata section via callback. +** +** key_c: metadata key +** invokes callback with (ptr_v, len, data) or (ptr_v, -1, 0) if not found. 
+*/ +void +u3_book_read_meta(u3_book* txt_u, + void* ptr_v, + const c3_c* key_c, + void (*read_f)(void*, c3_zs, void*)) +{ + u3_book_meta met_u; + c3_zs ret_zs; + + if ( !txt_u ) { + read_f(ptr_v, -1, 0); + return; + } + + // read metadata section at fixed offset + ret_zs = pread(txt_u->fid_i, &met_u, sizeof(u3_book_meta), + sizeof(u3_book_head)); + + if ( ret_zs != sizeof(u3_book_meta) ) { + fprintf(stderr, "book: read_meta: failed to read metadata: %s\r\n", + strerror(errno)); + read_f(ptr_v, -1, 0); + return; + } + + // match key and extract corresponding field + if ( 0 == strcmp(key_c, "version") ) { + read_f(ptr_v, sizeof(c3_w), &met_u.ver_w); + } + else if ( 0 == strcmp(key_c, "who") ) { + read_f(ptr_v, sizeof(c3_d[2]), met_u.who_d); + } + else if ( 0 == strcmp(key_c, "fake") ) { + read_f(ptr_v, sizeof(c3_o), &met_u.fak_o); + } + else if ( 0 == strcmp(key_c, "life") ) { + read_f(ptr_v, sizeof(c3_w), &met_u.lif_w); + } + else { + read_f(ptr_v, -1, 0); + } +} + +/* u3_book_save_meta(): write fixed metadata section via callback. 
+** +** key_c: metadata key +** val_z: value size in bytes +** val_p: pointer to value data +*/ +c3_o +u3_book_save_meta(u3_book* txt_u, + const c3_c* key_c, + c3_z val_z, + void* val_p) +{ + u3_book_meta met_u; + c3_zs ret_zs; + + if ( !txt_u ) { + return c3n; + } + + // read current metadata + ret_zs = pread(txt_u->fid_i, &met_u, sizeof(u3_book_meta), + sizeof(u3_book_head)); + + if ( ret_zs != sizeof(u3_book_meta) ) { + fprintf(stderr, "book: save_meta: failed to read current metadata: %s\r\n", + strerror(errno)); + return c3n; + } + + // update field based on key + if ( 0 == strcmp(key_c, "version") ) { + if ( val_z != sizeof(c3_w) ) return c3n; + memcpy(&met_u.ver_w, val_p, val_z); + } + else if ( 0 == strcmp(key_c, "who") ) { + if ( val_z != sizeof(c3_d[2]) ) return c3n; + memcpy(met_u.who_d, val_p, val_z); + } + else if ( 0 == strcmp(key_c, "fake") ) { + if ( val_z != sizeof(c3_o) ) return c3n; + memcpy(&met_u.fak_o, val_p, val_z); + } + else if ( 0 == strcmp(key_c, "life") ) { + if ( val_z != sizeof(c3_w) ) return c3n; + memcpy(&met_u.lif_w, val_p, val_z); + } + else { + return c3n; + } + + // write metadata section at fixed offset + ret_zs = pwrite(txt_u->fid_i, &met_u, sizeof(u3_book_meta), + sizeof(u3_book_head)); + + if ( ret_zs != sizeof(u3_book_meta) ) { + fprintf(stderr, "book: save_meta: failed to write metadata: %s\r\n", + strerror(errno)); + return c3n; + } + + // sync metadata to disk + if ( -1 == c3_sync(txt_u->fid_i) ) { + fprintf(stderr, "book: save_meta: failed to sync metadata: %s\r\n", + strerror(errno)); + return c3n; + } + + return c3y; +} + /* u3_book_walk_init(): initialize event iterator. ** ** sets up iterator to read events from [nex_d] to [las_d] inclusive. @@ -828,323 +982,3 @@ u3_book_walk_done(u3_book_walk* itr_u) itr_u->liv_o = c3n; itr_u->fid_i = -1; } - -/* u3_book_read_meta(): read metadata by string key from log. -** -** invokes callback with (ptr_v, len, data) or (ptr_v, -1, 0) if not found. 
-*/ -void -u3_book_read_meta(u3_book* txt_u, - void* ptr_v, - const c3_c* key_c, - void (*read_f)(void*, c3_zs, void*)) -{ - c3_w ken_w; // key length - c3_y* buf_y; // metadata buffer - c3_w len_w; // metadata length - c3_zs ret_zs; - c3_w off_w; - c3_w cot_w; // count - - if ( !txt_u ) { - read_f(ptr_v, -1, 0); - return; - } - - // check if metadata section exists - if ( 0 == txt_u->hed_u.len_w ) { - read_f(ptr_v, -1, 0); - return; - } - - // read entire metadata section - len_w = txt_u->hed_u.len_w; - buf_y = c3_malloc(len_w); - - ret_zs = pread(txt_u->fid_i, buf_y, len_w, txt_u->hed_u.off_w); - if ( ret_zs != (c3_zs)len_w ) { - fprintf(stderr, "book: read_meta: failed to read metadata section\r\n"); - c3_free(buf_y); - read_f(ptr_v, -1, 0); - return; - } - - // parse metadata section - // format: [4 bytes: count] + entries - // entry: [4 bytes: key_len][key][4 bytes: val_len][val] - - if ( len_w < 4 ) { - fprintf(stderr, "book: read_meta: metadata section too small\r\n"); - c3_free(buf_y); - read_f(ptr_v, -1, 0); - return; - } - - memcpy(&cot_w, buf_y, 4); - off_w = 4; - - ken_w = strlen(key_c); - - // linear search for key - for ( c3_w i_w = 0; i_w < cot_w; i_w++ ) { - c3_w entry_key_len; - c3_y* entry_key; - c3_w entry_val_len; - c3_y* entry_val; - - // read key length - if ( off_w + 4 > len_w ) { - fprintf(stderr, "book: read_meta: corrupt metadata (key len)\r\n"); - c3_free(buf_y); - read_f(ptr_v, -1, 0); - return; - } - memcpy(&entry_key_len, buf_y + off_w, 4); - off_w += 4; - - // read key - if ( off_w + entry_key_len > len_w ) { - fprintf(stderr, "book: read_meta: corrupt metadata (key)\r\n"); - c3_free(buf_y); - read_f(ptr_v, -1, 0); - return; - } - entry_key = buf_y + off_w; - off_w += entry_key_len; - - // read value length - if ( off_w + 4 > len_w ) { - fprintf(stderr, "book: read_meta: corrupt metadata (val len)\r\n"); - c3_free(buf_y); - read_f(ptr_v, -1, 0); - return; - } - memcpy(&entry_val_len, buf_y + off_w, 4); - off_w += 4; - - // read value 
- if ( off_w + entry_val_len > len_w ) { - fprintf(stderr, "book: read_meta: corrupt metadata (val)\r\n"); - c3_free(buf_y); - read_f(ptr_v, -1, 0); - return; - } - entry_val = buf_y + off_w; - off_w += entry_val_len; - - // check if this is the key we're looking for - if ( entry_key_len == ken_w && - 0 == memcmp(entry_key, key_c, ken_w) ) - { - // found it - invoke callback - read_f(ptr_v, entry_val_len, entry_val); - c3_free(buf_y); - return; - } - } - - // not found - c3_free(buf_y); - read_f(ptr_v, -1, 0); -} - -/* u3_book_save_meta(): save metadata by string key into log. -** -** updates or inserts key-value pair in metadata section. -*/ -c3_o -u3_book_save_meta(u3_book* txt_u, - const c3_c* key_c, - c3_z val_z, - void* val_p) -{ - c3_w key_len; - c3_y* old_meta = 0; - c3_w old_len = 0; - c3_w old_count = 0; - c3_y* new_meta; - c3_w new_len; - c3_w new_count; - c3_w offset; - c3_o found = c3n; - c3_zs ret_zs; - - if ( !txt_u ) { - return c3n; - } - - key_len = strlen(key_c); - - // read existing metadata if present - if ( 0 != txt_u->hed_u.len_w ) { - old_len = txt_u->hed_u.len_w; - old_meta = c3_malloc(old_len); - - ret_zs = pread(txt_u->fid_i, old_meta, old_len, txt_u->hed_u.off_w); - if ( ret_zs != (c3_zs)old_len ) { - fprintf(stderr, "book: save_meta: failed to read old metadata\r\n"); - c3_free(old_meta); - return c3n; - } - - if ( old_len < 4 ) { - fprintf(stderr, "book: save_meta: corrupt old metadata\r\n"); - c3_free(old_meta); - return c3n; - } - - memcpy(&old_count, old_meta, 4); - } - - // calculate new metadata size - // worst case: all old entries + new entry - new_len = 4; // count field - - // add existing entries (except if we're updating) - if ( old_meta ) { - offset = 4; - for ( c3_w i_w = 0; i_w < old_count; i_w++ ) { - c3_w entry_key_len, entry_val_len; - - if ( offset + 4 > old_len ) break; - memcpy(&entry_key_len, old_meta + offset, 4); - offset += 4; - - if ( offset + entry_key_len > old_len ) break; - - // check if this is the key we're 
updating - if ( entry_key_len == key_len && - 0 == memcmp(old_meta + offset, key_c, key_len) ) - { - found = c3y; - // skip old value, we'll add new one - offset += entry_key_len; - if ( offset + 4 > old_len ) break; - memcpy(&entry_val_len, old_meta + offset, 4); - offset += 4 + entry_val_len; - continue; - } - - // add this entry to new size - offset += entry_key_len; - if ( offset + 4 > old_len ) break; - memcpy(&entry_val_len, old_meta + offset, 4); - offset += 4; - - new_len += 4 + entry_key_len + 4 + entry_val_len; - offset += entry_val_len; - } - } - - // add new/updated entry - new_len += 4 + key_len + 4 + val_z; - - // allocate new metadata buffer - new_meta = c3_malloc(new_len); - - // write count - new_count = (c3y == found) ? old_count : old_count + 1; - memcpy(new_meta, &new_count, 4); - offset = 4; - - // copy existing entries (except updated one) - if ( old_meta ) { - c3_w old_offset = 4; - for ( c3_w i_w = 0; i_w < old_count; i_w++ ) { - c3_w entry_key_len, entry_val_len; - - if ( old_offset + 4 > old_len ) break; - memcpy(&entry_key_len, old_meta + old_offset, 4); - - if ( old_offset + 4 + entry_key_len > old_len ) break; - - // skip if this is the key we're updating - if ( entry_key_len == key_len && - 0 == memcmp(old_meta + old_offset + 4, key_c, key_len) ) - { - old_offset += 4 + entry_key_len; - if ( old_offset + 4 > old_len ) break; - memcpy(&entry_val_len, old_meta + old_offset, 4); - old_offset += 4 + entry_val_len; - continue; - } - - // copy this entry - memcpy(new_meta + offset, old_meta + old_offset, 4); - offset += 4; - old_offset += 4; - - memcpy(new_meta + offset, old_meta + old_offset, entry_key_len); - offset += entry_key_len; - old_offset += entry_key_len; - - if ( old_offset + 4 > old_len ) break; - memcpy(&entry_val_len, old_meta + old_offset, 4); - memcpy(new_meta + offset, &entry_val_len, 4); - offset += 4; - old_offset += 4; - - memcpy(new_meta + offset, old_meta + old_offset, entry_val_len); - offset += entry_val_len; - 
old_offset += entry_val_len; - } - } - - // add new/updated entry - memcpy(new_meta + offset, &key_len, 4); - offset += 4; - memcpy(new_meta + offset, key_c, key_len); - offset += key_len; - { - c3_w val_len_w = (c3_w)val_z; // convert c3_z to c3_w for 4-byte field - memcpy(new_meta + offset, &val_len_w, 4); - } - offset += 4; - memcpy(new_meta + offset, val_p, val_z); - offset += val_z; - - // write new metadata section in reserved area after header - c3_w new_off = sizeof(u3_book_head); - - // ensure metadata fits in reserved space - if ( new_len > BOOK_META_SIZE ) { - fprintf(stderr, "book: save_meta: metadata too large (%u > %u)\r\n", - new_len, BOOK_META_SIZE); - c3_free(new_meta); - if ( old_meta ) c3_free(old_meta); - return c3n; - } - - ret_zs = pwrite(txt_u->fid_i, new_meta, new_len, new_off); - if ( ret_zs != (c3_zs)new_len ) { - fprintf(stderr, "book: save_meta: failed to write metadata: %s\r\n", - strerror(errno)); - c3_free(new_meta); - if ( old_meta ) c3_free(old_meta); - return c3n; - } - - c3_free(new_meta); - if ( old_meta ) c3_free(old_meta); - - // sync metadata - if ( -1 == c3_sync(txt_u->fid_i) ) { - fprintf(stderr, "book: save_meta: failed to sync metadata: %s\r\n", - strerror(errno)); - return c3n; - } - - // update header - txt_u->hed_u.off_w = new_off; - txt_u->hed_u.len_w = new_len; - txt_u->dit_o = c3y; - - // write and sync header - if ( c3n == _book_save_head(txt_u) ) { - return c3n; - } - - // off_w is not affected by metadata writes - events append at off_w - - return c3y; -} diff --git a/pkg/vere/db/book.h b/pkg/vere/db/book.h index d7e7ef093f..72fafcdbb9 100644 --- a/pkg/vere/db/book.h +++ b/pkg/vere/db/book.h @@ -15,10 +15,29 @@ c3_d fir_d; // first event number in file c3_d las_d; // last event number in file c3_w off_w; // offset to metadata section - c3_w len_w; // length of metadata section + c3_w len_w; // length of metadata section (reserved, currently unused) c3_y pad_y[32]; // reserved for future use, zeroed } u3_book_head; 
+ /* u3_book_meta: on-disk metadata format (fixed 256 bytes) + ** + ** layout: + ** [4 bytes] version + ** [16 bytes] who_d (c3_d[2], identity) + ** [1 byte] fak_o (fake security bit) + ** [4 bytes] lif_w (lifecycle length) + ** [231 bytes] reserved for future use + ** + ** total: 256 bytes + */ + typedef struct _u3_book_meta { + c3_w ver_w; // metadata format version + c3_d who_d[2]; // ship identity (16 bytes) + c3_o fak_o; // fake security flag (1 byte) + c3_w lif_w; // lifecycle length (4 bytes) + c3_y pad_y[231]; // reserved (231 bytes) + } u3_book_meta; + /* u3_book: event log handle */ typedef struct _u3_book { @@ -112,7 +131,7 @@ c3_z* siz_i, c3_d epo_d); - /* u3_book_read_meta(): read metadata by string key from log. + /* u3_book_read_meta(): read fixed metadata section. */ void u3_book_read_meta(u3_book* txt_u, @@ -120,7 +139,7 @@ const c3_c* key_c, void (*read_f)(void*, c3_zs, void*)); - /* u3_book_save_meta(): save metadata by string key into log. + /* u3_book_save_meta(): write fixed metadata section. */ c3_o u3_book_save_meta(u3_book* txt_u, diff --git a/pkg/vere/disk.c b/pkg/vere/disk.c index ec6c0dc0d8..691d9a6c70 100644 --- a/pkg/vere/disk.c +++ b/pkg/vere/disk.c @@ -466,12 +466,6 @@ u3_disk_walk_done(u3_disk_walk* wok_u) static c3_o _disk_save_meta(u3_book* mdb_u, const c3_c* key_c, c3_w len_w, c3_y* byt_y) { - // strip trailing zeroes. 
- // - while ( len_w && !byt_y[len_w - 1] ) { - len_w--; - } - return u3_book_save_meta(mdb_u, key_c, len_w, byt_y); } @@ -485,7 +479,7 @@ u3_disk_save_meta(u3_book* mdb_u, const u3_meta* met_u) u3_noun who = u3i_chubs(2, met_u->who_d); if ( (c3n == _disk_save_meta(mdb_u, "version", sizeof(c3_w), (c3_y*)&met_u->ver_w)) - || (c3n == _disk_save_meta(mdb_u, "who", 2 * sizeof(c3_d), (c3_y*)met_u->who_d)) + || (c3n == _disk_save_meta(mdb_u, "who", sizeof(met_u->who_d), (c3_y*)met_u->who_d)) || (c3n == _disk_save_meta(mdb_u, "fake", sizeof(c3_o), (c3_y*)&met_u->fak_o)) || (c3n == _disk_save_meta(mdb_u, "life", sizeof(c3_w), (c3_y*)&met_u->lif_w)) ) { @@ -1694,7 +1688,7 @@ _disk_migrate_old(u3_disk* log_u) // set version to 2 (migration in progress) log_u->ver_w = U3D_VER2; - if ( c3n == _disk_save_meta(log_u->mdb_u, "version", 4, (c3_y*)&log_u->ver_w) ) { + if ( c3n == _disk_save_meta(log_u->mdb_u, "version", sizeof(c3_w), (c3_y*)&log_u->ver_w) ) { fprintf(stderr, "disk: failed to set version to 2\r\n"); exit(1); } From c4443f96739159e50866e8206d93d4b16bcf49b0 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Fri, 2 Jan 2026 12:11:29 -0500 Subject: [PATCH 07/38] book: fixes contiguity validation printf --- pkg/vere/db/book.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index caf4a6eb79..9dc38123ae 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -586,7 +586,9 @@ u3_book_save(u3_book* txt_u, if ( 0 == txt_u->hed_u.las_d ) { // empty log: first event must be the first event in the epoch if ( epo_d + 1 != eve_d ) { - fprintf(stderr, "book: first event must be 1, got %" PRIu64 "\r\n", eve_d); + fprintf(stderr, "book: first event must be start of epoch, " + "expected %" PRIu64 ", got %" PRIu64 + "\r\n", epo_d + 1, eve_d); return c3n; } txt_u->hed_u.fir_d = eve_d; From 9e1cb8d56468894faee4b547d5a74594db224e61 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Fri, 2 Jan 2026 12:12:40 -0500 Subject: 
[PATCH 08/38] book: runs `book-test` in ci --- .github/workflows/shared.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/shared.yml b/.github/workflows/shared.yml index e13506e239..76cc550d16 100644 --- a/.github/workflows/shared.yml +++ b/.github/workflows/shared.yml @@ -100,6 +100,7 @@ jobs: pact-test equality-test \ boot-test newt-test \ vere-noun-test unix-test \ + book-test \ benchmarks \ -Doptimize=ReleaseFast \ -Dpace=${{inputs.pace}} \ From 59ae845d7584e28804c953ef2d19b0e5d272dead Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Fri, 2 Jan 2026 12:37:52 -0500 Subject: [PATCH 09/38] disk: renames `mdb_u` to `txt_u` --- pkg/vere/disk.c | 90 ++++++++++++++++++++++++------------------------- pkg/vere/main.c | 2 +- pkg/vere/mars.c | 6 ++-- pkg/vere/vere.h | 6 ++-- 4 files changed, 52 insertions(+), 52 deletions(-) diff --git a/pkg/vere/disk.c b/pkg/vere/disk.c index 691d9a6c70..3d58e1f963 100644 --- a/pkg/vere/disk.c +++ b/pkg/vere/disk.c @@ -92,7 +92,7 @@ _disk_commit_cb(uv_work_t* ted_u) { u3_disk* log_u = ted_u->data; - log_u->sav_u.ret_o = u3_book_save(log_u->mdb_u, + log_u->sav_u.ret_o = u3_book_save(log_u->txt_u, log_u->sav_u.eve_d, log_u->sav_u.len_w, (void**)log_u->sav_u.byt_y, @@ -274,7 +274,7 @@ u3_disk_sync(u3_disk* log_u) // XX max 100 // if ( c3y == _disk_batch(log_u) ) { - ret_o = u3_book_save(log_u->mdb_u, + ret_o = u3_book_save(log_u->txt_u, log_u->sav_u.eve_d, log_u->sav_u.len_w, (void**)log_u->sav_u.byt_y, @@ -375,7 +375,7 @@ u3_disk_read_list(u3_disk* log_u, c3_d eve_d, c3_d len_d, c3_l* mug_l) { struct _cd_list ven_u = { log_u, u3_nul, 0 }; - if ( c3n == u3_book_read(log_u->mdb_u, &ven_u, + if ( c3n == u3_book_read(log_u->txt_u, &ven_u, eve_d, len_d, _disk_read_list_cb) ) { // XX test normal (not subcommand) replay with and without, @@ -399,7 +399,7 @@ u3_disk_walk_init(u3_disk* log_u, c3_d max_d = eve_d + len_d - 1; wok_u->log_u = log_u; - wok_u->liv_o = u3_book_walk_init(log_u->mdb_u, + wok_u->liv_o = 
u3_book_walk_init(log_u->txt_u, &wok_u->itr_u, eve_d, c3_min(max_d, log_u->dun_d)); @@ -464,24 +464,24 @@ u3_disk_walk_done(u3_disk_walk* wok_u) /* _disk_save_meta(): serialize atom, save as metadata at [key_c]. */ static c3_o -_disk_save_meta(u3_book* mdb_u, const c3_c* key_c, c3_w len_w, c3_y* byt_y) +_disk_save_meta(u3_book* txt_u, const c3_c* key_c, c3_w len_w, c3_y* byt_y) { - return u3_book_save_meta(mdb_u, key_c, len_w, byt_y); + return u3_book_save_meta(txt_u, key_c, len_w, byt_y); } /* u3_disk_save_meta(): save metadata. */ c3_o -u3_disk_save_meta(u3_book* mdb_u, const u3_meta* met_u) +u3_disk_save_meta(u3_book* txt_u, const u3_meta* met_u) { u3_assert( c3y == u3a_is_cat(met_u->lif_w) ); u3_noun who = u3i_chubs(2, met_u->who_d); - if ( (c3n == _disk_save_meta(mdb_u, "version", sizeof(c3_w), (c3_y*)&met_u->ver_w)) - || (c3n == _disk_save_meta(mdb_u, "who", sizeof(met_u->who_d), (c3_y*)met_u->who_d)) - || (c3n == _disk_save_meta(mdb_u, "fake", sizeof(c3_o), (c3_y*)&met_u->fak_o)) - || (c3n == _disk_save_meta(mdb_u, "life", sizeof(c3_w), (c3_y*)&met_u->lif_w)) ) + if ( (c3n == _disk_save_meta(txt_u, "version", sizeof(c3_w), (c3_y*)&met_u->ver_w)) + || (c3n == _disk_save_meta(txt_u, "who", sizeof(met_u->who_d), (c3_y*)met_u->who_d)) + || (c3n == _disk_save_meta(txt_u, "fake", sizeof(c3_o), (c3_y*)&met_u->fak_o)) + || (c3n == _disk_save_meta(txt_u, "life", sizeof(c3_w), (c3_y*)&met_u->lif_w)) ) { u3z(who); return c3n; @@ -540,7 +540,7 @@ _disk_meta_read_cb(void* ptr_v, ssize_t val_i, void* val_v) /* u3_disk_read_meta(): read metadata. 
*/ c3_o -u3_disk_read_meta(u3_book* mdb_u, u3_meta* met_u) +u3_disk_read_meta(u3_book* txt_u, u3_meta* met_u) { c3_w ver_w, lif_w; c3_d who_d[2]; @@ -550,13 +550,13 @@ u3_disk_read_meta(u3_book* mdb_u, u3_meta* met_u) // version // - u3_book_read_meta(mdb_u, &val_u, "version", _disk_meta_read_cb); + u3_book_read_meta(txt_u, &val_u, "version", _disk_meta_read_cb); ver_w = val_u.buf_y[0]; // identity // - u3_book_read_meta(mdb_u, &val_u, "who", _disk_meta_read_cb); + u3_book_read_meta(txt_u, &val_u, "who", _disk_meta_read_cb); if ( 0 > val_u.hav_i ) { fprintf(stderr, "disk: read meta: no identity\r\n"); @@ -591,7 +591,7 @@ u3_disk_read_meta(u3_book* mdb_u, u3_meta* met_u) // fake bit // - u3_book_read_meta(mdb_u, &val_u, "fake", _disk_meta_read_cb); + u3_book_read_meta(txt_u, &val_u, "fake", _disk_meta_read_cb); if ( 0 > val_u.hav_i ) { fprintf(stderr, "disk: read meta: no fake bit\r\n"); @@ -611,7 +611,7 @@ u3_disk_read_meta(u3_book* mdb_u, u3_meta* met_u) // life // - u3_book_read_meta(mdb_u, &val_u, "life", _disk_meta_read_cb); + u3_book_read_meta(txt_u, &val_u, "life", _disk_meta_read_cb); if ( 0 > val_u.hav_i ) { fprintf(stderr, "disk: read meta: no lifecycle length\r\n"); @@ -825,7 +825,7 @@ u3_disk_exit(u3_disk* log_u) // close database // - u3_book_exit(log_u->mdb_u); + u3_book_exit(log_u->txt_u); // dispose planned writes // @@ -1145,15 +1145,15 @@ _disk_epoc_roll(u3_disk* log_u, c3_d epo_d) // get metadata from old log, update version u3_meta old_u; - if ( c3y != u3_disk_read_meta(log_u->mdb_u, &old_u) ) { + if ( c3y != u3_disk_read_meta(log_u->txt_u, &old_u) ) { fprintf(stderr, "disk: failed to read metadata\r\n"); goto fail3; } - u3_book_exit(log_u->mdb_u); - log_u->mdb_u = 0; + u3_book_exit(log_u->txt_u); + log_u->txt_u = 0; // initialize db of new epoch - if ( 0 == (log_u->mdb_u = u3_book_init(epo_c)) ) { + if ( 0 == (log_u->txt_u = u3_book_init(epo_c)) ) { fprintf(stderr, "disk: failed to initialize database\r\n"); c3_free(log_u); goto fail3; @@ 
-1161,7 +1161,7 @@ _disk_epoc_roll(u3_disk* log_u, c3_d epo_d) // write the metadata to the database old_u.ver_w = U3D_VERLAT; - if ( c3n == u3_disk_save_meta(log_u->mdb_u, &old_u) ) { + if ( c3n == u3_disk_save_meta(log_u->txt_u, &old_u) ) { fprintf(stderr, "disk: failed to save metadata\r\n"); goto fail3; } @@ -1335,19 +1335,19 @@ _disk_migrate_epoc(u3_disk* log_u, c3_d eve_d) * 6. open epoch lmdb and set it in log_u */ - // NB: requires that log_u->mdb_u is initialized to log/data.mdb + // NB: requires that log_u->txt_u is initialized to log/data.mdb // XX: put old log in separate pointer (old_u?)? // get metadata from old log, update version u3_meta olm_u; - if ( c3y != u3_disk_read_meta(log_u->mdb_u, &olm_u) ) { + if ( c3y != u3_disk_read_meta(log_u->txt_u, &olm_u) ) { fprintf(stderr, "disk: failed to read metadata\r\n"); return c3n; } // finish with old log - u3_book_exit(log_u->mdb_u); - log_u->mdb_u = 0; + u3_book_exit(log_u->txt_u); + log_u->txt_u = 0; // check if lock.mdb is readable in log directory c3_o luk_o = c3n; @@ -1414,22 +1414,22 @@ _disk_migrate_epoc(u3_disk* log_u, c3_d eve_d) return c3n; } - if ( 0 == (log_u->mdb_u = u3_book_init(tmp_c)) ) { + if ( 0 == (log_u->txt_u = u3_book_init(tmp_c)) ) { fprintf(stderr, "disk: failed to initialize database at %s\r\n", tmp_c); return c3n; } olm_u.ver_w = U3D_VERLAT; - if ( c3n == u3_disk_save_meta(log_u->mdb_u, &olm_u) ) { + if ( c3n == u3_disk_save_meta(log_u->txt_u, &olm_u) ) { fprintf(stderr, "disk: failed to save metadata\r\n"); return c3n; } // atomic truncation of old log // - u3_book_exit(log_u->mdb_u); - log_u->mdb_u = 0; + u3_book_exit(log_u->txt_u); + log_u->txt_u = 0; c3_c trd_c[8193]; snprintf(trd_c, sizeof(trd_c), "%s/data.mdb", tmp_c); @@ -1449,7 +1449,7 @@ _disk_migrate_epoc(u3_disk* log_u, c3_d eve_d) strerror(errno)); } - if ( 0 == (log_u->mdb_u = u3_book_init(epo_c)) ) { + if ( 0 == (log_u->txt_u = u3_book_init(epo_c)) ) { fprintf(stderr, "disk: failed to initialize database at %s\r\n", 
epo_c); return c3n; @@ -1525,7 +1525,7 @@ u3_disk_roll(u3_disk* log_u, c3_d eve_d) // XX get fir_d from log_u c3_d fir_d, las_d; - if ( c3n == u3_book_gulf(log_u->mdb_u, &fir_d, &las_d) ) { + if ( c3n == u3_book_gulf(log_u->txt_u, &fir_d, &las_d) ) { fprintf(stderr, "roll: failed to read first/last event numbers\r\n"); exit(1); } @@ -1675,7 +1675,7 @@ static void _disk_migrate_old(u3_disk* log_u) { c3_d fir_d, las_d; - if ( c3n == u3_book_gulf(log_u->mdb_u, &fir_d, &las_d) ) { + if ( c3n == u3_book_gulf(log_u->txt_u, &fir_d, &las_d) ) { fprintf(stderr, "disk: failed to get first/last event numbers\r\n"); exit(1); } @@ -1688,7 +1688,7 @@ _disk_migrate_old(u3_disk* log_u) // set version to 2 (migration in progress) log_u->ver_w = U3D_VER2; - if ( c3n == _disk_save_meta(log_u->mdb_u, "version", sizeof(c3_w), (c3_y*)&log_u->ver_w) ) { + if ( c3n == _disk_save_meta(log_u->txt_u, "version", sizeof(c3_w), (c3_y*)&log_u->ver_w) ) { fprintf(stderr, "disk: failed to set version to 2\r\n"); exit(1); } @@ -1765,7 +1765,7 @@ _disk_epoc_load(u3_disk* log_u, c3_d lat_d, u3_disk_load_e lod_e) snprintf(epo_c, 8192, "%s/0i%" PRIc3_d, log_u->com_u->pax_c, lat_d); // initialize latest epoch's db - if ( 0 == (log_u->mdb_u = u3_book_init(epo_c)) ) { + if ( 0 == (log_u->txt_u = u3_book_init(epo_c)) ) { fprintf(stderr, "disk: failed to initialize database at %s\r\n", epo_c); return _epoc_fail; @@ -1775,20 +1775,20 @@ _disk_epoc_load(u3_disk* log_u, c3_d lat_d, u3_disk_load_e lod_e) // get first/last event numbers from book c3_d fir_d, las_d; - if ( c3n == u3_book_gulf(log_u->mdb_u, &fir_d, &las_d) ) { + if ( c3n == u3_book_gulf(log_u->txt_u, &fir_d, &las_d) ) { fprintf(stderr, "disk: failed to get first/last event numbers\r\n"); - u3_book_exit(log_u->mdb_u); - log_u->mdb_u = 0; + u3_book_exit(log_u->txt_u); + log_u->txt_u = 0; return _epoc_fail; } if ( (u3_dlod_boot != lod_e) && !fir_d && !las_d - && (c3n == u3_disk_read_meta(log_u->mdb_u, 0)) ) + && (c3n == 
u3_disk_read_meta(log_u->txt_u, 0)) ) { - u3_book_exit(log_u->mdb_u); - log_u->mdb_u = 0; + u3_book_exit(log_u->txt_u); + log_u->txt_u = 0; return _epoc_void; } @@ -2039,8 +2039,8 @@ u3_disk_load(c3_c* pax_c, u3_disk_load_e lod_e) // { u3_meta met_u; - if ( (0 == (log_u->mdb_u = u3_book_init(log_c))) - || (c3n == u3_disk_read_meta(log_u->mdb_u, &met_u)) ) + if ( (0 == (log_u->txt_u = u3_book_init(log_c))) + || (c3n == u3_disk_read_meta(log_u->txt_u, &met_u)) ) { fprintf(stderr, "disk: failed to read metadata\r\n"); c3_free(log_u); // XX leaks dire(s) @@ -2065,8 +2065,8 @@ u3_disk_load(c3_c* pax_c, u3_disk_load_e lod_e) } // close top-level book - u3_book_exit(log_u->mdb_u); - log_u->mdb_u = 0; + u3_book_exit(log_u->txt_u); + log_u->txt_u = 0; // get latest epoch number c3_d lat_d; diff --git a/pkg/vere/main.c b/pkg/vere/main.c index e1b3512172..0aad3c3380 100644 --- a/pkg/vere/main.c +++ b/pkg/vere/main.c @@ -1506,7 +1506,7 @@ _cw_info(c3_i argc, c3_c* argv[]) fprintf(stderr, "\r\n"); } - u3_book_stat(log_u->mdb_u->pax_c); + u3_book_stat(log_u->txt_u->pax_c); u3_disk_exit(log_u); u3m_stop(); diff --git a/pkg/vere/mars.c b/pkg/vere/mars.c index 949369d88e..af5b65373a 100644 --- a/pkg/vere/mars.c +++ b/pkg/vere/mars.c @@ -1295,7 +1295,7 @@ u3_mars_play(u3_mars* mar_u, c3_d eve_d, c3_d sap_d) if ( !mar_u->dun_d ) { u3_meta met_u; - if ( c3n == u3_disk_read_meta(log_u->mdb_u, &met_u) ) { + if ( c3n == u3_disk_read_meta(log_u->txt_u, &met_u) ) { fprintf(stderr, "mars: disk read meta fail\r\n"); // XX exit code, cb // @@ -1451,7 +1451,7 @@ u3_mars_load(u3_mars* mar_u, u3_disk_load_e lod_e) mar_u->sen_d = mar_u->dun_d = u3A->eve_d; mar_u->mug_l = u3r_mug(u3A->roc); - if ( c3n == u3_disk_read_meta(mar_u->log_u->mdb_u, &(mar_u->met_u)) ) { + if ( c3n == u3_disk_read_meta(mar_u->log_u->txt_u, &(mar_u->met_u)) ) { fprintf(stderr, "mars: disk meta fail\r\n"); u3_disk_exit(mar_u->log_u); exit(1); // XX @@ -1949,7 +1949,7 @@ u3_mars_boot(u3_mars* mar_u, c3_d len_d, c3_y* hun_y) 
exit(1); // XX cleanup } - if ( c3n == u3_disk_save_meta(log_u->mdb_u, &met_u) ) { + if ( c3n == u3_disk_save_meta(log_u->txt_u, &met_u) ) { exit(1); // XX cleanup } diff --git a/pkg/vere/vere.h b/pkg/vere/vere.h index a793e9aee4..cd4460aef5 100644 --- a/pkg/vere/vere.h +++ b/pkg/vere/vere.h @@ -533,7 +533,7 @@ c3_i lok_i; // lockfile c3_o liv_o; // live c3_w ver_w; // version (see version.h) - u3_book* mdb_u; // book env of current epoch + u3_book* txt_u; // book env of current epoch c3_d sen_d; // commit requested c3_d dun_d; // committed c3_d epo_d; // current epoch number @@ -878,12 +878,12 @@ /* u3_disk_read_meta(): read metadata. */ c3_o - u3_disk_read_meta(u3_book* mdb_u, u3_meta* met_u); + u3_disk_read_meta(u3_book* txt_u, u3_meta* met_u); /* u3_disk_save_meta(): save metadata. */ c3_o - u3_disk_save_meta(u3_book* mdb_u, const u3_meta* met_u); + u3_disk_save_meta(u3_book* txt_u, const u3_meta* met_u); /* u3_disk_save_meta_meta(): save meta metadata. */ From f3210b3799daf11b0c609c5030f3fde82b7288c4 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Fri, 2 Jan 2026 12:40:24 -0500 Subject: [PATCH 10/38] book: uses `PRIu64` format specifier instead of `llu` in tests --- pkg/vere/book_tests.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/pkg/vere/book_tests.c b/pkg/vere/book_tests.c index 8792389a46..06a8a8e888 100644 --- a/pkg/vere/book_tests.c +++ b/pkg/vere/book_tests.c @@ -75,14 +75,14 @@ _test_verify_event(c3_d eve_d, c3_z siz_z, void* buf_v) memcpy(&mug_w, buf_y, 4); if ( mug_w != (c3_w)eve_d ) { - fprintf(stderr, "book_tests: event %llu mug mismatch: got %u\r\n", eve_d, mug_w); + fprintf(stderr, "book_tests: event %" PRIu64 " mug mismatch: got %u\r\n", eve_d, mug_w); return c3n; } expected_len = 16 + (eve_d % 32); if ( siz_z != 4 + expected_len ) { - fprintf(stderr, "book_tests: event %llu size mismatch: got %zu, expected %zu (4 + %zu)\r\n", + fprintf(stderr, "book_tests: event %" PRIu64 " 
size mismatch: got %zu, expected %zu (4 + %zu)\r\n", eve_d, siz_z, 4 + expected_len, expected_len); return c3n; } @@ -90,7 +90,7 @@ _test_verify_event(c3_d eve_d, c3_z siz_z, void* buf_v) // verify jam data pattern for ( c3_z i = 0; i < expected_len; i++ ) { if ( buf_y[4 + i] != (c3_y)((eve_d + i) & 0xff) ) { - fprintf(stderr, "book_tests: event %llu data mismatch at offset %zu\r\n", + fprintf(stderr, "book_tests: event %" PRIu64 " data mismatch at offset %zu\r\n", eve_d, i); return c3n; } @@ -169,7 +169,7 @@ _test_calculate_event_offset(const c3_c* dir_c, c3_d target_eve, c3_w* off_w) ret_zs = pread(fid_i, &deed_hed, sizeof(u3_book_deed_head), cur_off); if ( sizeof(u3_book_deed_head) != ret_zs ) { - fprintf(stderr, "book_tests: deed header read failed at event %llu offset %u\r\n", + fprintf(stderr, "book_tests: deed header read failed at event %" PRIu64 " offset %u\r\n", cur_d, cur_off); close(fid_i); return c3n; @@ -371,7 +371,7 @@ _test_read_cb(void* ptr_v, c3_d eve_d, c3_z siz_z, void* buf_v) read_ctx* ctx = (read_ctx*)ptr_v; if ( eve_d != ctx->expected_start + ctx->count ) { - fprintf(stderr, "book_tests: read callback event mismatch: %llu vs %llu\r\n", + fprintf(stderr, "book_tests: read callback event mismatch: %" PRIu64 " vs %" PRIu64 "\r\n", eve_d, ctx->expected_start + ctx->count); ctx->failed = c3y; return c3n; @@ -421,7 +421,7 @@ _test_book_init_empty(void) } if ( 0 != low_d || 0 != hig_d ) { - fprintf(stderr, "book_tests: empty gulf wrong: [%llu, %llu]\r\n", + fprintf(stderr, "book_tests: empty gulf wrong: [%" PRIu64 ", %" PRIu64 "]\r\n", low_d, hig_d); u3_book_exit(log_u); _test_cleanup(tmp_c); @@ -475,7 +475,7 @@ _test_book_single_event(void) // verify gulf u3_book_gulf(log_u, &low_d, &hig_d); if ( 1 != low_d || 1 != hig_d ) { - fprintf(stderr, "book_tests: single gulf wrong: [%llu, %llu]\r\n", + fprintf(stderr, "book_tests: single gulf wrong: [%" PRIu64 ", %" PRIu64 "]\r\n", low_d, hig_d); u3_book_exit(log_u); _test_cleanup(tmp_c); @@ -496,7 +496,7 
@@ _test_book_single_event(void) } if ( c3y == ctx.failed || 1 != ctx.count ) { - fprintf(stderr, "book_tests: read verify failed (failed=%u, count=%llu)\r\n", + fprintf(stderr, "book_tests: read verify failed (failed=%u, count=%" PRIu64 ")\r\n", ctx.failed, ctx.count); u3_book_exit(log_u); _test_cleanup(tmp_c); @@ -558,7 +558,7 @@ _test_book_batch_write(void) // verify gulf u3_book_gulf(log_u, &low_d, &hig_d); if ( 1 != low_d || 100 != hig_d ) { - fprintf(stderr, "book_tests: batch gulf wrong: [%llu, %llu]\r\n", + fprintf(stderr, "book_tests: batch gulf wrong: [%" PRIu64 ", %" PRIu64 "]\r\n", low_d, hig_d); u3_book_exit(log_u); _test_cleanup(tmp_c); @@ -579,7 +579,7 @@ _test_book_batch_write(void) } if ( c3y == ctx.failed || 100 != ctx.count ) { - fprintf(stderr, "book_tests: batch read verify failed (failed=%u, count=%llu)\r\n", + fprintf(stderr, "book_tests: batch read verify failed (failed=%u, count=%" PRIu64 ")\r\n", ctx.failed, ctx.count); u3_book_exit(log_u); _test_cleanup(tmp_c); @@ -649,7 +649,7 @@ _test_book_persistence(void) u3_book_gulf(log_u, &low_d, &hig_d); if ( 1 != low_d || 50 != hig_d ) { - fprintf(stderr, "book_tests: persist gulf wrong: [%llu, %llu]\r\n", + fprintf(stderr, "book_tests: persist gulf wrong: [%" PRIu64 ", %" PRIu64 "]\r\n", low_d, hig_d); u3_book_exit(log_u); _test_cleanup(tmp_c); @@ -802,7 +802,7 @@ _test_book_partial_read(void) } if ( c3y == ctx.failed || 26 != ctx.count ) { - fprintf(stderr, "book_tests: partial verify failed: count=%llu\r\n", + fprintf(stderr, "book_tests: partial verify failed: count=%" PRIu64 "\r\n", ctx.count); u3_book_exit(log_u); _test_cleanup(tmp_c); @@ -875,7 +875,7 @@ _test_book_iterator(void) c3_d expected_eve = 10 + count; if ( c3n == _test_verify_event(expected_eve, len_z, buf_v) ) { - fprintf(stderr, "book_tests: iter verify failed at %llu\r\n", count); + fprintf(stderr, "book_tests: iter verify failed at %" PRIu64 "\r\n", count); c3_free(buf_v); u3_book_walk_done(&itr_u); u3_book_exit(log_u); @@ 
-889,7 +889,7 @@ _test_book_iterator(void) } if ( 21 != count ) { // events 10-30 inclusive = 21 events - fprintf(stderr, "book_tests: iter count wrong: %llu\r\n", count); + fprintf(stderr, "book_tests: iter count wrong: %" PRIu64 "\r\n", count); u3_book_walk_done(&itr_u); u3_book_exit(log_u); _test_cleanup(tmp_c); @@ -1261,7 +1261,7 @@ _test_book_corrupt_deed_crc(void) // verify recovery truncated to event 24 u3_book_gulf(log_u, &low_d, &hig_d); if ( 1 != low_d || 24 != hig_d ) { - fprintf(stderr, "book_tests: corrupt_deed_crc gulf wrong: [%llu, %llu] expected [1, 24]\r\n", + fprintf(stderr, "book_tests: corrupt_deed_crc gulf wrong: [%" PRIu64 ", %" PRIu64 "] expected [1, 24]\r\n", low_d, hig_d); u3_book_exit(log_u); _test_cleanup(tmp_c); @@ -1359,7 +1359,7 @@ _test_book_corrupt_deed_length_mismatch(void) // verify recovery truncated to event 14 u3_book_gulf(log_u, &low_d, &hig_d); if ( 1 != low_d || 14 != hig_d ) { - fprintf(stderr, "book_tests: corrupt_deed_length gulf wrong: [%llu, %llu] expected [1, 14]\r\n", + fprintf(stderr, "book_tests: corrupt_deed_length gulf wrong: [%" PRIu64 ", %" PRIu64 "] expected [1, 14]\r\n", low_d, hig_d); u3_book_exit(log_u); _test_cleanup(tmp_c); @@ -1446,7 +1446,7 @@ _test_book_truncated_deed_partial(void) // verify recovery removed partial event 20 u3_book_gulf(log_u, &low_d, &hig_d); if ( 1 != low_d || 19 != hig_d ) { - fprintf(stderr, "book_tests: truncated_deed gulf wrong: [%llu, %llu] expected [1, 19]\r\n", + fprintf(stderr, "book_tests: truncated_deed gulf wrong: [%" PRIu64 ", %" PRIu64 "] expected [1, 19]\r\n", low_d, hig_d); u3_book_exit(log_u); _test_cleanup(tmp_c); @@ -1532,7 +1532,7 @@ _test_book_multiple_corruptions(void) // verify recovery stopped at first corruption (event 30) u3_book_gulf(log_u, &low_d, &hig_d); if ( 1 != low_d || 29 != hig_d ) { - fprintf(stderr, "book_tests: multi_corrupt gulf wrong: [%llu, %llu] expected [1, 29]\r\n", + fprintf(stderr, "book_tests: multi_corrupt gulf wrong: [%" PRIu64 ", %" 
PRIu64 "] expected [1, 29]\r\n", low_d, hig_d); u3_book_exit(log_u); _test_cleanup(tmp_c); @@ -1610,7 +1610,7 @@ _test_book_corrupt_first_event(void) // verify log is empty u3_book_gulf(log_u, &low_d, &hig_d); if ( 0 != low_d || 0 != hig_d ) { - fprintf(stderr, "book_tests: corrupt_first gulf wrong: [%llu, %llu] expected [0, 0]\r\n", + fprintf(stderr, "book_tests: corrupt_first gulf wrong: [%" PRIu64 ", %" PRIu64 "] expected [0, 0]\r\n", low_d, hig_d); u3_book_exit(log_u); _test_cleanup(tmp_c); @@ -1941,7 +1941,7 @@ _test_large_event_cb(void* ptr_v, c3_d eve_d, c3_z siz_z, void* buf_v) c3_z* expected_size = (c3_z*)ptr_v; if ( 1 != eve_d ) { - fprintf(stderr, "book_tests: large_event_cb wrong event: %llu\r\n", eve_d); + fprintf(stderr, "book_tests: large_event_cb wrong event: %" PRIu64 "\r\n", eve_d); return c3n; } @@ -2502,7 +2502,7 @@ _test_book_iterator_concurrent_modification(void) // verify we read the expected range (10-30 = 21 events, already read 5) if ( 21 != count ) { - fprintf(stderr, "book_tests: iter_concurrent count wrong: %llu\r\n", count); + fprintf(stderr, "book_tests: iter_concurrent count wrong: %" PRIu64 "\r\n", count); u3_book_walk_done(&itr_u); u3_book_exit(log_u); _test_cleanup(tmp_c); From 20b05a7cd3b05422c7bcac2d525147d6163edae9 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Tue, 13 Jan 2026 11:33:26 -0500 Subject: [PATCH 11/38] book: removes unused `_book_crc32` function --- pkg/vere/db/book.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index 9dc38123ae..327bcb5ed8 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -28,15 +28,6 @@ */ #define BOOK_MAGIC 0x424f4f4b // "BOOK" #define BOOK_VERSION 1 // format version - #define BOOK_META_SIZE 256 // reserved metadata area size - -/* _book_crc32(): compute CRC32 checksum. -*/ -static c3_w -_book_crc32(c3_y* buf_y, c3_w len_w) -{ - return (c3_w)crc32(0L, buf_y, len_w); -} /* _book_crc32_two(): compute CRC32 over two buffers. 
*/ From ed807d5562fcebac0b6961362bca7162e0d677d9 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Tue, 13 Jan 2026 11:34:17 -0500 Subject: [PATCH 12/38] book: removes `BOOK_META_SIZE` macro and replaces usage with `sizeof(u3_book_meta)` --- pkg/vere/db/book.c | 12 ++++++------ pkg/vere/db/book.h | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index 327bcb5ed8..1dad5aefb9 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -105,7 +105,7 @@ _book_init_head(u3_book* txt_u) txt_u->hed_u.fir_d = 0; txt_u->hed_u.las_d = 0; txt_u->hed_u.off_w = sizeof(u3_book_head); - txt_u->hed_u.len_w = BOOK_META_SIZE; + txt_u->hed_u.len_w = sizeof(u3_book_meta); txt_u->dit_o = c3y; } @@ -329,7 +329,7 @@ _book_skip_deed(c3_i fid_i, c3_w* off_w) static c3_w _book_scan_end(u3_book* txt_u) { - c3_w off_w = sizeof(u3_book_head) + BOOK_META_SIZE; // start + c3_w off_w = sizeof(u3_book_head) + sizeof(u3_book_meta); // start c3_d cot_d = 0; // count c3_d exp_d; // expected event number @@ -370,7 +370,7 @@ _book_scan_end(u3_book* txt_u) if ( cot_d == 0 ) { txt_u->hed_u.fir_d = 0; txt_u->hed_u.las_d = 0; - off_w = sizeof(u3_book_head) + BOOK_META_SIZE; + off_w = sizeof(u3_book_head) + sizeof(u3_book_meta); } else { txt_u->hed_u.las_d = txt_u->hed_u.fir_d + cot_d - 1; } @@ -436,7 +436,7 @@ u3_book_init(const c3_c* pax_c) return 0; } // events start after header + reserved metadata area - txt_u->off_w = sizeof(u3_book_head) + BOOK_META_SIZE; + txt_u->off_w = sizeof(u3_book_head) + sizeof(u3_book_meta); } else if ( buf_u.st_size < (off_t)sizeof(u3_book_head) ) { // corrupt file: too small @@ -683,7 +683,7 @@ u3_book_read(u3_book* txt_u, } // scan to starting event (events start after header + metadata area) - off_w = sizeof(u3_book_head) + BOOK_META_SIZE; + off_w = sizeof(u3_book_head) + sizeof(u3_book_meta); cur_d = txt_u->hed_u.fir_d; while ( cur_d < eve_d ) { @@ -887,7 +887,7 @@ u3_book_walk_init(u3_book* txt_u, } // scan 
to starting event (events start after header + metadata area) - off_w = sizeof(u3_book_head) + BOOK_META_SIZE; + off_w = sizeof(u3_book_head) + sizeof(u3_book_meta); cur_d = txt_u->hed_u.fir_d; while ( cur_d < nex_d ) { diff --git a/pkg/vere/db/book.h b/pkg/vere/db/book.h index 72fafcdbb9..ccfd69cab0 100644 --- a/pkg/vere/db/book.h +++ b/pkg/vere/db/book.h @@ -31,10 +31,10 @@ ** total: 256 bytes */ typedef struct _u3_book_meta { - c3_w ver_w; // metadata format version c3_d who_d[2]; // ship identity (16 bytes) - c3_o fak_o; // fake security flag (1 byte) + c3_w ver_w; // metadata format version c3_w lif_w; // lifecycle length (4 bytes) + c3_o fak_o; // fake security flag (1 byte) c3_y pad_y[231]; // reserved (231 bytes) } u3_book_meta; From 861fbdafa375a7d135404ffcee6922179723ce2d Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Wed, 14 Jan 2026 16:32:35 -0500 Subject: [PATCH 13/38] book: simplifies header, makes it immutable, and refactors accordingly --- pkg/vere/db/book.c | 581 ++++++++++++++++++++++++++++++--------------- pkg/vere/db/book.h | 19 +- 2 files changed, 392 insertions(+), 208 deletions(-) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index 1dad5aefb9..7112c94117 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -16,13 +16,14 @@ // book: append-only event log // // simple file-based persistence layer for urbit's event log. -// optimized for sequential writes and reads, no random access. +// optimized for sequential writes and reads; no random access. // // file format: -// [64-byte header] -// [256-byte metadata section] +// [16-byte header (immutable)] // [events: len_d | mug_l | jam_data | crc_m | let_d] // +// metadata stored in separate meta.bin file +// /* constants */ @@ -38,29 +39,153 @@ _book_crc32_two(c3_y* one_y, c3_w one_w, c3_y* two_y, c3_w two_w) return (c3_w)crc32(crc_w, two_y, two_w); } -/* _book_save_head(): write header to file at offset 0. +/* _book_meta_path(): construct path to meta.bin from book.log path. 
+** +** caller must free result with c3_free() +*/ +static c3_c* +_book_meta_path(const c3_c* pax_c) +{ + c3_c* met_c = c3_malloc(strlen(pax_c) + 16); + c3_c* dir_c = c3_malloc(strlen(pax_c) + 1); + + if ( !met_c || !dir_c ) { + c3_free(met_c); + c3_free(dir_c); + return 0; + } + + strcpy(dir_c, pax_c); + + // find last '/' to get directory + c3_c* sla_c = strrchr(dir_c, '/'); + if ( sla_c ) { + *sla_c = '\0'; + } + + snprintf(met_c, strlen(pax_c) + 16, "%s/meta.bin", dir_c); + c3_free(dir_c); + return met_c; +} + +/* _book_init_meta_file(): open/create meta.bin file. +** +** returns: file descriptor, or -1 on error +*/ +static c3_i +_book_init_meta_file(const c3_c* pax_c) +{ + c3_c* met_c = _book_meta_path(pax_c); + c3_i met_i = c3_open(met_c, O_RDWR | O_CREAT, 0644); + + if ( 0 > met_i ) { + c3_free(met_c); + return -1; + } + + // check file size; if zero, initialize with blank metadata + struct stat buf_u; + if ( 0 > fstat(met_i, &buf_u) ) { + close(met_i); + c3_free(met_c); + return -1; + } + + if ( 0 == buf_u.st_size ) { + u3_book_meta met_u; + memset(&met_u, 0, sizeof(u3_book_meta)); + + c3_zs ret_zs = pwrite(met_i, &met_u, sizeof(u3_book_meta), 0); + if ( ret_zs != sizeof(u3_book_meta) ) { + close(met_i); + c3_free(met_c); + return -1; + } + + if ( -1 == c3_sync(met_i) ) { + close(met_i); + c3_free(met_c); + return -1; + } + } + + c3_free(met_c); + return met_i; +} + +/* _book_read_meta_file(): read metadata from meta.bin. +** +** returns: c3y on success, c3n on failure +*/ +static c3_o +_book_read_meta_file(c3_i met_i, u3_book_meta* met_u) +{ + if ( 0 > met_i ) { + return c3n; + } + + c3_zs ret_zs = pread(met_i, met_u, sizeof(u3_book_meta), 0); + if ( ret_zs != sizeof(u3_book_meta) ) { + return c3n; + } + + return c3y; +} + +/* _book_save_meta_file(): write metadata to meta.bin. 
+** +** returns: c3y on success, c3n on failure */ static c3_o -_book_save_head(u3_book* txt_u) +_book_save_meta_file(c3_i met_i, const u3_book_meta* met_u) +{ + if ( 0 > met_i ) { + return c3n; + } + + c3_zs ret_zs = pwrite(met_i, met_u, sizeof(u3_book_meta), 0); + if ( ret_zs != sizeof(u3_book_meta) ) { + return c3n; + } + + if ( -1 == c3_sync(met_i) ) { + return c3n; + } + + return c3y; +} + +/* _book_make_head(): initialize and write header for new file. +** +** header is write-once and immutable after creation. +*/ +static c3_o +_book_make_head(u3_book* txt_u) { c3_zs ret_zs; + // initialize header + memset(&txt_u->hed_u, 0, sizeof(u3_book_head)); + txt_u->hed_u.mag_w = BOOK_MAGIC; + txt_u->hed_u.ver_w = BOOK_VERSION; + txt_u->hed_u.fir_d = 0; + + // write header ret_zs = pwrite(txt_u->fid_i, &txt_u->hed_u, sizeof(u3_book_head), 0); if ( ret_zs != sizeof(u3_book_head) ) { - fprintf(stderr, "book: failed to write header: %s\r\n", + u3l_log("book: failed to write header: %s\r\n", strerror(errno)); return c3n; } if ( -1 == c3_sync(txt_u->fid_i) ) { - fprintf(stderr, "book: failed to sync header: %s\r\n", + u3l_log("book: failed to sync header: %s\r\n", strerror(errno)); return c3n; } - txt_u->dit_o = c3n; return c3y; } @@ -94,51 +219,8 @@ _book_read_head(u3_book* txt_u) return c3y; } -/* _book_init_head(): initialize header for new file. -*/ -static void -_book_init_head(u3_book* txt_u) -{ - memset(&txt_u->hed_u, 0, sizeof(u3_book_head)); - txt_u->hed_u.mag_w = BOOK_MAGIC; - txt_u->hed_u.ver_w = BOOK_VERSION; - txt_u->hed_u.fir_d = 0; - txt_u->hed_u.las_d = 0; - txt_u->hed_u.off_w = sizeof(u3_book_head); - txt_u->hed_u.len_w = sizeof(u3_book_meta); - txt_u->dit_o = c3y; -} - -/* _book_init_meta(): initialize metadata section with zeros. 
-*/ -static c3_o -_book_init_meta(u3_book* txt_u) -{ - u3_book_meta met_u; - c3_zs ret_zs; - - // zero-initialize metadata - memset(&met_u, 0, sizeof(u3_book_meta)); - - // write metadata section at fixed offset - ret_zs = pwrite(txt_u->fid_i, &met_u, sizeof(u3_book_meta), - sizeof(u3_book_head)); - if ( ret_zs != sizeof(u3_book_meta) ) { - fprintf(stderr, "book: init_meta: failed to write metadata: %s\r\n", - strerror(errno)); - return c3n; - } - - // sync metadata to disk - if ( -1 == c3_sync(txt_u->fid_i) ) { - fprintf(stderr, "book: init_meta: failed to sync metadata: %s\r\n", - strerror(errno)); - return c3n; - } - return c3y; -} /* _book_deed_size(): calculate total on-disk size of deed. */ @@ -167,7 +249,7 @@ static c3_o _book_okay_reed(const u3_book_reed* red_u) { // validate length - if ( 0 == red_u->len_d || (1ULL << 32) < red_u->len_d ) { + if ( 0 == red_u->len_d ) { return c3n; } @@ -189,25 +271,19 @@ _book_okay_reed(const u3_book_reed* red_u) ** on success, caller must free red_u->jam_y */ static c3_o -_book_read_deed(c3_i fid_i, c3_w* off_w, u3_book_reed* red_u) +_book_read_deed(c3_i fid_i, c3_d* off_d, u3_book_reed* red_u) { c3_zs ret_zs; - c3_w now_w = *off_w; + c3_d now_d = *off_d; c3_d let_d; // read deed_head u3_book_deed_head hed_u; - ret_zs = pread(fid_i, &hed_u, sizeof(u3_book_deed_head), now_w); + ret_zs = pread(fid_i, &hed_u, sizeof(u3_book_deed_head), now_d); if ( ret_zs != sizeof(u3_book_deed_head) ) { return c3n; } - now_w += sizeof(u3_book_deed_head); - - // validate length - if ( 0 == hed_u.len_d || (1ULL << 32) < hed_u.len_d ) { - fprintf(stderr, "book: invalid length: %" PRIu64 "\r\n", hed_u.len_d); - return c3n; - } + now_d += sizeof(u3_book_deed_head); // populate reed from head red_u->len_d = hed_u.len_d; @@ -216,21 +292,24 @@ _book_read_deed(c3_i fid_i, c3_w* off_w, u3_book_reed* red_u) // read jam data (len_d - mug bytes) c3_d jaz_d = red_u->len_d - 4; red_u->jam_y = c3_malloc(jaz_d); - ret_zs = pread(fid_i, red_u->jam_y, jaz_d, 
now_w); + if ( !red_u->jam_y ) { + return c3n; + } + ret_zs = pread(fid_i, red_u->jam_y, jaz_d, now_d); if ( ret_zs != (c3_zs)jaz_d ) { c3_free(red_u->jam_y); return c3n; } - now_w += jaz_d; + now_d += jaz_d; // read deed_tail u3_book_deed_tail tal_u; - ret_zs = pread(fid_i, &tal_u, sizeof(u3_book_deed_tail), now_w); + ret_zs = pread(fid_i, &tal_u, sizeof(u3_book_deed_tail), now_d); if ( ret_zs != sizeof(u3_book_deed_tail) ) { c3_free(red_u->jam_y); return c3n; } - now_w += sizeof(u3_book_deed_tail); + now_d += sizeof(u3_book_deed_tail); // populate reed from tail red_u->crc_w = tal_u.crc_w; @@ -243,7 +322,7 @@ _book_read_deed(c3_i fid_i, c3_w* off_w, u3_book_reed* red_u) } // update offset - *off_w = now_w; + *off_d = now_d; return c3y; } @@ -255,10 +334,10 @@ _book_read_deed(c3_i fid_i, c3_w* off_w, u3_book_reed* red_u) ** c3n: failure */ static c3_o -_book_save_deed(c3_i fid_i, c3_w* off_w, const u3_book_reed* red_u) +_book_save_deed(c3_i fid_i, c3_d* off_d, const u3_book_reed* red_u) { c3_zs ret_zs; - c3_w now_w = *off_w; + c3_d now_d = *off_d; c3_d jaz_d = red_u->len_d - 4; // len_d - mug bytes // write deed_head @@ -266,32 +345,32 @@ _book_save_deed(c3_i fid_i, c3_w* off_w, const u3_book_reed* red_u) hed_u.len_d = red_u->len_d; hed_u.mug_l = red_u->mug_l; - ret_zs = pwrite(fid_i, &hed_u, sizeof(u3_book_deed_head), now_w); + ret_zs = pwrite(fid_i, &hed_u, sizeof(u3_book_deed_head), now_d); if ( ret_zs != sizeof(u3_book_deed_head) ) { return c3n; } - now_w += sizeof(u3_book_deed_head); + now_d += sizeof(u3_book_deed_head); // write jam data - ret_zs = pwrite(fid_i, red_u->jam_y, jaz_d, now_w); + ret_zs = pwrite(fid_i, red_u->jam_y, jaz_d, now_d); if ( ret_zs != (c3_zs)jaz_d ) { return c3n; } - now_w += jaz_d; + now_d += jaz_d; // write deed_tail u3_book_deed_tail tal_u; tal_u.crc_w = red_u->crc_w; tal_u.let_d = red_u->len_d; // length trailer (same as len_d) - ret_zs = pwrite(fid_i, &tal_u, sizeof(u3_book_deed_tail), now_w); + ret_zs = pwrite(fid_i, &tal_u, 
sizeof(u3_book_deed_tail), now_d); if ( ret_zs != sizeof(u3_book_deed_tail) ) { return c3n; } - now_w += sizeof(u3_book_deed_tail); + now_d += sizeof(u3_book_deed_tail); // update offset - *off_w = now_w; + *off_d = now_d; return c3y; } @@ -303,19 +382,121 @@ _book_save_deed(c3_i fid_i, c3_w* off_w, const u3_book_reed* red_u) ** c3n: failure (EOF) */ static c3_o -_book_skip_deed(c3_i fid_i, c3_w* off_w) +_book_skip_deed(c3_i fid_i, c3_d* off_d) { c3_zs ret_zs; c3_d len_d; // read only the len_d field - ret_zs = pread(fid_i, &len_d, sizeof(c3_d), *off_w); + ret_zs = pread(fid_i, &len_d, sizeof(c3_d), *off_d); if ( ret_zs != sizeof(c3_d) ) { return c3n; } // skip entire deed: deed_head + jam + deed_tail - *off_w += _book_deed_size(len_d); + *off_d += _book_deed_size(len_d); + + return c3y; +} + +/* _book_scan_back(): reverse scan to find last valid deed. +** +** scans backwards from file end using trailing let_d field. +** on success, sets *off_d to append offset and updates txt_u->las_d. 
+** +** returns: +** c3y: success +** c3n: failure (empty file or no valid deeds) +*/ +static c3_o +_book_scan_back(u3_book* txt_u, c3_d* off_d) +{ + struct stat buf_u; + c3_d end_d; + c3_d pos_d; + c3_d cot_d = 0; // count of valid deeds found + + // get file size + if ( -1 == fstat(txt_u->fid_i, &buf_u) ) { + *off_d = sizeof(u3_book_head); + return c3n; + } + + end_d = (c3_d)buf_u.st_size; + + // check for empty or header-only file + if ( end_d <= sizeof(u3_book_head) ) { + *off_d = sizeof(u3_book_head); + return c3n; + } + + pos_d = end_d; + + // scan backwards + while ( pos_d > sizeof(u3_book_head) ) { + c3_zs ret_zs; + c3_d let_d; + c3_d siz_d; + c3_d ded_d; // deed start offset + + // need at least deed_tail size to read let_d + if ( pos_d < sizeof(u3_book_head) + sizeof(u3_book_deed_tail) ) { + break; + } + + // read let_d from end of deed (last 8 bytes before pos_d) + ret_zs = pread(txt_u->fid_i, &let_d, sizeof(c3_d), + pos_d - sizeof(c3_d)); + if ( ret_zs != sizeof(c3_d) ) { + break; + } + + // validate let_d is reasonable + if ( 0 == let_d || (1ULL << 32) < let_d ) { + break; + } + + // calculate deed size and start position + siz_d = _book_deed_size(let_d); + if ( siz_d > pos_d - sizeof(u3_book_head) ) { + // deed would extend before header + break; + } + + ded_d = pos_d - siz_d; + + // read and validate the deed + { + u3_book_reed red_u; + c3_d tmp_d = ded_d; + + if ( c3n == _book_read_deed(txt_u->fid_i, &tmp_d, &red_u) ) { + break; + } + + if ( c3n == _book_okay_reed(&red_u) ) { + c3_free(red_u.jam_y); + break; + } + + c3_free(red_u.jam_y); + } + + // deed is valid, record position and continue backwards + cot_d++; + pos_d = ded_d; + } + + // check if we found any valid deeds + if ( 0 == cot_d ) { + *off_d = sizeof(u3_book_head); + return c3n; + } + + // success: compute last event number + // cot_d deeds found, first event is fir_d + *off_d = end_d; + txt_u->las_d = txt_u->hed_u.fir_d + cot_d - 1; return c3y; } @@ -323,71 +504,77 @@ 
_book_skip_deed(c3_i fid_i, c3_w* off_w) /* _book_scan_end(): scan to find actual end of valid events. ** ** validates each record's CRC and len_d == let_d. +** caches las_d and off_d in txt_u. ** returns offset to append next event. -** updates header if corruption detected. */ -static c3_w +static c3_d _book_scan_end(u3_book* txt_u) { - c3_w off_w = sizeof(u3_book_head) + sizeof(u3_book_meta); // start + c3_d off_d = sizeof(u3_book_head); // start of events c3_d cot_d = 0; // count + c3_d las_d = 0; // last valid event found c3_d exp_d; // expected event number - if ( 0 == txt_u->hed_u.fir_d && 0 == txt_u->hed_u.las_d ) { + if ( 0 == txt_u->hed_u.fir_d && 0 == txt_u->las_d ) { // empty log - return off_w; + txt_u->las_d = 0; + txt_u->off_d = off_d; + return off_d; } - exp_d = txt_u->hed_u.las_d - txt_u->hed_u.fir_d + 1; + exp_d = txt_u->las_d - txt_u->hed_u.fir_d + 1; while ( 1 ) { u3_book_reed red_u; - c3_w off_start = off_w; + c3_d off_start = off_d; // read deed into reed - if ( c3n == _book_read_deed(txt_u->fid_i, &off_w, &red_u) ) { + if ( c3n == _book_read_deed(txt_u->fid_i, &off_d, &red_u) ) { // EOF or read error break; } // validate reed (CRC and length checks) if ( c3n == _book_okay_reed(&red_u) ) { - fprintf(stderr, "book: validation failed at offset %u\r\n", off_start); + u3l_log("book: validation failed at offset %" PRIu64 "\r\n", off_start); c3_free(red_u.jam_y); break; } + las_d = txt_u->hed_u.fir_d + cot_d; c3_free(red_u.jam_y); cot_d++; } // check if we found fewer events than expected if ( cot_d != exp_d ) { - fprintf(stderr, "book: recovery: found %" PRIu64 " events, expected %" PRIu64 "\r\n", + u3l_log("book: recovery: found %" PRIu64 " events, expected %" PRIu64 "\r\n", cot_d, exp_d); - // update header - if ( cot_d == 0 ) { - txt_u->hed_u.fir_d = 0; - txt_u->hed_u.las_d = 0; - off_w = sizeof(u3_book_head) + sizeof(u3_book_meta); + // update las_d based on what we found + if ( 0 == cot_d ) { + txt_u->las_d = 0; + off_d = sizeof(u3_book_head); 
} else { - txt_u->hed_u.las_d = txt_u->hed_u.fir_d + cot_d - 1; + txt_u->las_d = las_d; } - txt_u->dit_o = c3y; - _book_save_head(txt_u); - // truncate file - if ( -1 == ftruncate(txt_u->fid_i, off_w) ) { - fprintf(stderr, "book: failed to truncate: %s\r\n", + if ( -1 == ftruncate(txt_u->fid_i, off_d) ) { + u3l_log("book: failed to truncate: %s\r\n", strerror(errno)); } else { - c3_sync(txt_u->fid_i); + if ( -1 == c3_sync(txt_u->fid_i) ) { + u3l_log("book: failed to sync after truncate: %s\r\n", + strerror(errno)); + } } + } else { + txt_u->las_d = las_d; } - return off_w; + txt_u->off_d = off_d; + return off_d; } /* u3_book_init(): open/create event log. @@ -396,7 +583,7 @@ u3_book* u3_book_init(const c3_c* pax_c) { c3_c path_c[8193]; - c3_i fid_i; + c3_i fid_i, met_i; struct stat buf_u; u3_book* txt_u; @@ -406,43 +593,53 @@ u3_book_init(const c3_c* pax_c) // open or create file fid_i = c3_open(path_c, O_RDWR | O_CREAT, 0644); if ( 0 > fid_i ) { - fprintf(stderr, "book: failed to open %s: %s\r\n", + u3l_log("book: failed to open %s: %s\r\n", path_c, strerror(errno)); return 0; } + // open/create meta.bin file + met_i = _book_init_meta_file(pax_c); + if ( 0 > met_i ) { + u3l_log("book: failed to open meta.bin\r\n"); + close(fid_i); + return 0; + } + // get file size if ( 0 > fstat(fid_i, &buf_u) ) { - fprintf(stderr, "book: fstat failed: %s\r\n", strerror(errno)); + u3l_log("book: fstat failed: %s\r\n", strerror(errno)); close(fid_i); + close(met_i); return 0; } // allocate log structure txt_u = c3_calloc(sizeof(u3_book)); txt_u->fid_i = fid_i; + txt_u->met_i = met_i; txt_u->pax_c = c3_malloc(strlen(path_c) + 1); + if ( !txt_u->pax_c ) { + close(fid_i); + close(met_i); + c3_free(txt_u); + return 0; + } strcpy(txt_u->pax_c, path_c); if ( buf_u.st_size == 0 ) { - // new file: initialize header - _book_init_head(txt_u); - _book_save_head(txt_u); - // initialize metadata section - if ( c3n == _book_init_meta(txt_u) ) { - close(fid_i); - c3_free(txt_u->pax_c); - 
c3_free(txt_u); - return 0; - } - // events start after header + reserved metadata area - txt_u->off_w = sizeof(u3_book_head) + sizeof(u3_book_meta); + // new file: initialize and write header + _book_make_head(txt_u); + // initialize cache: empty log + txt_u->las_d = 0; + txt_u->off_d = sizeof(u3_book_head); } else if ( buf_u.st_size < (off_t)sizeof(u3_book_head) ) { // corrupt file: too small - fprintf(stderr, "book: file too small: %lld bytes\r\n", + u3l_log("book: file too small: %lld bytes\r\n", (long long)buf_u.st_size); close(fid_i); + close(met_i); c3_free(txt_u->pax_c); c3_free(txt_u); return 0; @@ -451,13 +648,17 @@ u3_book_init(const c3_c* pax_c) // existing file: read and validate header if ( c3n == _book_read_head(txt_u) ) { close(fid_i); + close(met_i); c3_free(txt_u->pax_c); c3_free(txt_u); return 0; } - // scan to find actual end, recover from corruption - txt_u->off_w = _book_scan_end(txt_u); + // try fast reverse scan first, fall back to forward scan if needed + if ( c3n == _book_scan_back(txt_u, &txt_u->off_d) ) { + // reverse scan failed, use forward scan for recovery + _book_scan_end(txt_u); + } } return txt_u; @@ -472,14 +673,14 @@ u3_book_exit(u3_book* txt_u) return; } - // sync header if dirty - if ( c3y == txt_u->dit_o ) { - _book_save_head(txt_u); - } - - // close file + // close book.log file close(txt_u->fid_i); + // close meta.bin file + if ( 0 <= txt_u->met_i ) { + close(txt_u->met_i); + } + // free resources c3_free(txt_u->pax_c); c3_free(txt_u); @@ -495,7 +696,7 @@ u3_book_gulf(u3_book* txt_u, c3_d* low_d, c3_d* hig_d) } *low_d = txt_u->hed_u.fir_d; - *hig_d = txt_u->hed_u.las_d; + *hig_d = txt_u->las_d; return c3y; } @@ -545,10 +746,6 @@ u3_book_stat(const c3_c* pax_c) fprintf(stderr, " file: %s\r\n", pax_c); fprintf(stderr, " version: %u\r\n", hed_u.ver_w); fprintf(stderr, " first event: %" PRIu64 "\r\n", hed_u.fir_d); - fprintf(stderr, " last event: %" PRIu64 "\r\n", hed_u.las_d); - fprintf(stderr, " event count: %" PRIu64 "\r\n", 
- (0 == hed_u.las_d ) ? 0 : - (hed_u.las_d - hed_u.fir_d + 1)); fprintf(stderr, " file size: %lld bytes\r\n", (long long)buf_u.st_size); close(fid_i); @@ -567,14 +764,14 @@ u3_book_save(u3_book* txt_u, c3_z* siz_i, c3_d epo_d) { - c3_w now_w; + c3_d now_d; if ( !txt_u ) { return c3n; } // validate contiguity - if ( 0 == txt_u->hed_u.las_d ) { + if ( 0 == txt_u->hed_u.fir_d ) { // empty log: first event must be the first event in the epoch if ( epo_d + 1 != eve_d ) { fprintf(stderr, "book: first event must be start of epoch, " @@ -583,18 +780,26 @@ u3_book_save(u3_book* txt_u, return c3n; } txt_u->hed_u.fir_d = eve_d; + + // persist fir_d (write-once) + if ( sizeof(c3_d) != pwrite(txt_u->fid_i, &txt_u->hed_u.fir_d, + sizeof(c3_d), offsetof(u3_book_head, fir_d)) ) + { + fprintf(stderr, "book: failed to write fir_d: %s\r\n", strerror(errno)); + return c3n; + } } else { // non-empty: must be contiguous - if ( eve_d != txt_u->hed_u.las_d + 1 ) { + if ( eve_d != txt_u->las_d + 1 ) { fprintf(stderr, "book: event gap: expected %" PRIu64 ", got %" PRIu64 "\r\n", - txt_u->hed_u.las_d + 1, eve_d); + txt_u->las_d + 1, eve_d); return c3n; } } // write each event deed - now_w = txt_u->off_w; + now_d = txt_u->off_d; for ( c3_w i_w = 0; i_w < len_d; i_w++ ) { c3_y* buf_y = (c3_y*)byt_p[i_w]; @@ -615,7 +820,7 @@ u3_book_save(u3_book* txt_u, red_u.crc_w = _book_calc_crc(&red_u); // save deed to file - if ( c3n == _book_save_deed(txt_u->fid_i, &now_w, &red_u) ) { + if ( c3n == _book_save_deed(txt_u->fid_i, &now_d, &red_u) ) { fprintf(stderr, "book: failed to save deed for event %" PRIu64 ": %s\r\n", eve_d + i_w, strerror(errno)); return c3n; @@ -629,15 +834,9 @@ u3_book_save(u3_book* txt_u, return c3n; } - // update header - txt_u->hed_u.las_d = eve_d + len_d - 1; - txt_u->off_w = now_w; - txt_u->dit_o = c3y; - - // write and sync header - if ( c3n == _book_save_head(txt_u) ) { - return c3n; - } + // update cache + txt_u->las_d = eve_d + len_d - 1; + txt_u->off_d = now_d; return c3y; 
} @@ -657,7 +856,7 @@ u3_book_read(u3_book* txt_u, c3_d len_d, c3_o (*read_f)(void*, c3_d, c3_z, void*)) { - c3_w off_w; + c3_d off_d; c3_d cur_d; if ( !txt_u ) { @@ -665,29 +864,29 @@ u3_book_read(u3_book* txt_u, } // validate range - if ( 0 == txt_u->hed_u.las_d ) { + if ( 0 == txt_u->las_d ) { // empty log fprintf(stderr, "book: read from empty log\r\n"); return c3n; } - if ( eve_d < txt_u->hed_u.fir_d || eve_d > txt_u->hed_u.las_d ) { + if ( eve_d < txt_u->hed_u.fir_d || eve_d > txt_u->las_d ) { fprintf(stderr, "book: event %" PRIu64 " out of range [%" PRIu64 ", %" PRIu64 "]\r\n", - eve_d, txt_u->hed_u.fir_d, txt_u->hed_u.las_d); + eve_d, txt_u->hed_u.fir_d, txt_u->las_d); return c3n; } - if ( eve_d + len_d - 1 > txt_u->hed_u.las_d ) { + if ( eve_d + len_d - 1 > txt_u->las_d ) { fprintf(stderr, "book: read range exceeds last event\r\n"); return c3n; } - // scan to starting event (events start after header + metadata area) - off_w = sizeof(u3_book_head) + sizeof(u3_book_meta); + // scan to starting event (events start after header) + off_d = sizeof(u3_book_head); cur_d = txt_u->hed_u.fir_d; while ( cur_d < eve_d ) { - if ( c3n == _book_skip_deed(txt_u->fid_i, &off_w) ) { + if ( c3n == _book_skip_deed(txt_u->fid_i, &off_d) ) { fprintf(stderr, "book: failed to scan to event %" PRIu64 "\r\n", eve_d); return c3n; } @@ -701,7 +900,7 @@ u3_book_read(u3_book* txt_u, c3_z len_z; // read deed into reed - if ( c3n == _book_read_deed(txt_u->fid_i, &off_w, &red_u) ) { + if ( c3n == _book_read_deed(txt_u->fid_i, &off_d, &red_u) ) { fprintf(stderr, "book: failed to read event %" PRIu64 "\r\n", cur_d); return c3n; } @@ -716,6 +915,10 @@ u3_book_read(u3_book* txt_u, // reconstruct buffer in mug + jam format for callback len_z = red_u.len_d; buf_y = c3_malloc(len_z); + if ( !buf_y ) { + c3_free(red_u.jam_y); + return c3n; + } memcpy(buf_y, &red_u.mug_l, 4); memcpy(buf_y + 4, red_u.jam_y, red_u.len_d - 4); @@ -745,20 +948,15 @@ u3_book_read_meta(u3_book* txt_u, void 
(*read_f)(void*, c3_zs, void*)) { u3_book_meta met_u; - c3_zs ret_zs; if ( !txt_u ) { read_f(ptr_v, -1, 0); return; } - // read metadata section at fixed offset - ret_zs = pread(txt_u->fid_i, &met_u, sizeof(u3_book_meta), - sizeof(u3_book_head)); - - if ( ret_zs != sizeof(u3_book_meta) ) { - fprintf(stderr, "book: read_meta: failed to read metadata: %s\r\n", - strerror(errno)); + // read metadata from meta.bin + if ( c3n == _book_read_meta_file(txt_u->met_i, &met_u) ) { + u3l_log("book: read_meta: failed to read metadata\r\n"); read_f(ptr_v, -1, 0); return; } @@ -781,7 +979,7 @@ u3_book_read_meta(u3_book* txt_u, } } -/* u3_book_save_meta(): write fixed metadata section via callback. +/* u3_book_save_meta(): write fixed metadata section. ** ** key_c: metadata key ** val_z: value size in bytes @@ -794,19 +992,14 @@ u3_book_save_meta(u3_book* txt_u, void* val_p) { u3_book_meta met_u; - c3_zs ret_zs; if ( !txt_u ) { return c3n; } - // read current metadata - ret_zs = pread(txt_u->fid_i, &met_u, sizeof(u3_book_meta), - sizeof(u3_book_head)); - - if ( ret_zs != sizeof(u3_book_meta) ) { - fprintf(stderr, "book: save_meta: failed to read current metadata: %s\r\n", - strerror(errno)); + // read current metadata from meta.bin + if ( c3n == _book_read_meta_file(txt_u->met_i, &met_u) ) { + u3l_log("book: save_meta: failed to read current metadata\r\n"); return c3n; } @@ -831,20 +1024,9 @@ u3_book_save_meta(u3_book* txt_u, return c3n; } - // write metadata section at fixed offset - ret_zs = pwrite(txt_u->fid_i, &met_u, sizeof(u3_book_meta), - sizeof(u3_book_head)); - - if ( ret_zs != sizeof(u3_book_meta) ) { - fprintf(stderr, "book: save_meta: failed to write metadata: %s\r\n", - strerror(errno)); - return c3n; - } - - // sync metadata to disk - if ( -1 == c3_sync(txt_u->fid_i) ) { - fprintf(stderr, "book: save_meta: failed to sync metadata: %s\r\n", - strerror(errno)); + // write metadata to meta.bin + if ( c3n == _book_save_meta_file(txt_u->met_i, &met_u) ) { + u3l_log("book: 
save_meta: failed to write metadata\r\n"); return c3n; } @@ -861,7 +1043,7 @@ u3_book_walk_init(u3_book* txt_u, c3_d nex_d, c3_d las_d) { - c3_w off_w; + c3_d off_d; c3_d cur_d; if ( !txt_u || !itr_u ) { @@ -869,29 +1051,29 @@ u3_book_walk_init(u3_book* txt_u, } // validate range - if ( 0 == txt_u->hed_u.las_d ) { + if ( 0 == txt_u->las_d ) { fprintf(stderr, "book: walk_init on empty log\r\n"); return c3n; } - if ( nex_d < txt_u->hed_u.fir_d || nex_d > txt_u->hed_u.las_d ) { + if ( nex_d < txt_u->hed_u.fir_d || nex_d > txt_u->las_d ) { fprintf(stderr, "book: walk_init start %" PRIu64 " out of range [%" PRIu64 ", %" PRIu64 "]\r\n", - nex_d, txt_u->hed_u.fir_d, txt_u->hed_u.las_d); + nex_d, txt_u->hed_u.fir_d, txt_u->las_d); return c3n; } - if ( las_d < nex_d || las_d > txt_u->hed_u.las_d ) { + if ( las_d < nex_d || las_d > txt_u->las_d ) { fprintf(stderr, "book: walk_init end %" PRIu64 " out of range [%" PRIu64 ", %" PRIu64 "]\r\n", - las_d, nex_d, txt_u->hed_u.las_d); + las_d, nex_d, txt_u->las_d); return c3n; } - // scan to starting event (events start after header + metadata area) - off_w = sizeof(u3_book_head) + sizeof(u3_book_meta); + // scan to starting event (events start after header) + off_d = sizeof(u3_book_head); cur_d = txt_u->hed_u.fir_d; while ( cur_d < nex_d ) { - if ( c3n == _book_skip_deed(txt_u->fid_i, &off_w) ) { + if ( c3n == _book_skip_deed(txt_u->fid_i, &off_d) ) { fprintf(stderr, "book: walk_init failed to scan to event %" PRIu64 "\r\n", nex_d); return c3n; } @@ -902,7 +1084,7 @@ u3_book_walk_init(u3_book* txt_u, itr_u->fid_i = txt_u->fid_i; itr_u->nex_d = nex_d; itr_u->las_d = las_d; - itr_u->off_w = off_w; + itr_u->off_d = off_d; itr_u->liv_o = c3y; return c3y; @@ -930,7 +1112,7 @@ u3_book_walk_next(u3_book_walk* itr_u, c3_z* len_z, void** buf_v) } // read deed into reed - if ( c3n == _book_read_deed(itr_u->fid_i, &itr_u->off_w, &red_u) ) { + if ( c3n == _book_read_deed(itr_u->fid_i, &itr_u->off_d, &red_u) ) { fprintf(stderr, "book: 
walk_next failed to read event %" PRIu64 "\r\n", itr_u->nex_d); itr_u->liv_o = c3n; @@ -949,6 +1131,11 @@ u3_book_walk_next(u3_book_walk* itr_u, c3_z* len_z, void** buf_v) // reconstruct buffer in mug + jam format *len_z = red_u.len_d; buf_y = c3_malloc(*len_z); + if ( !buf_y ) { + c3_free(red_u.jam_y); + itr_u->liv_o = c3n; + return c3n; + } memcpy(buf_y, &red_u.mug_l, 4); memcpy(buf_y + 4, red_u.jam_y, red_u.len_d - 4); diff --git a/pkg/vere/db/book.h b/pkg/vere/db/book.h index ccfd69cab0..fe257d1367 100644 --- a/pkg/vere/db/book.h +++ b/pkg/vere/db/book.h @@ -7,16 +7,12 @@ /* book: append-only event log */ - /* u3_book_head: on-disk file header (64 bytes) + /* u3_book_head: on-disk file header (16 bytes, immutable) */ typedef struct _u3_book_head { c3_w mag_w; // magic number: 0x424f4f4b ("BOOK") c3_w ver_w; // format version: 1 c3_d fir_d; // first event number in file - c3_d las_d; // last event number in file - c3_w off_w; // offset to metadata section - c3_w len_w; // length of metadata section (reserved, currently unused) - c3_y pad_y[32]; // reserved for future use, zeroed } u3_book_head; /* u3_book_meta: on-disk metadata format (fixed 256 bytes) @@ -41,11 +37,12 @@ /* u3_book: event log handle */ typedef struct _u3_book { - c3_i fid_i; // file descriptor - c3_c* pax_c; // file path - u3_book_head hed_u; // cached header - c3_w off_w; // append offset (end of last event) - c3_o dit_o; // header needs sync + c3_i fid_i; // file descriptor for book.log + c3_i met_i; // file descriptor for meta.bin + c3_c* pax_c; // file path to book.log + u3_book_head hed_u; // cached header (immutable) + c3_d las_d; // cached last event number + c3_d off_d; // cached append offset (end of last event) } u3_book; /* u3_book_walk: event iterator @@ -54,7 +51,7 @@ c3_i fid_i; // file descriptor c3_d nex_d; // next event number to read c3_d las_d; // last event number, inclusive - c3_w off_w; // current file offset + c3_d off_d; // current file offset c3_o liv_o; // iterator 
valid } u3_book_walk; From e988500f99c5f22c86e0a7e07da3e58f0aecaae7 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Mon, 19 Jan 2026 09:59:41 -0500 Subject: [PATCH 14/38] book: sync first event number of header before writing events to log --- pkg/vere/db/book.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index 7112c94117..a2d60c82a8 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -788,6 +788,12 @@ u3_book_save(u3_book* txt_u, fprintf(stderr, "book: failed to write fir_d: %s\r\n", strerror(errno)); return c3n; } + + // sync fir_d before writing deeds to ensure header is durable + if ( -1 == c3_sync(txt_u->fid_i) ) { + fprintf(stderr, "book: failed to sync fir_d: %s\r\n", strerror(errno)); + return c3n; + } } else { // non-empty: must be contiguous From 6f621f713a8d34fbdea8f41c7f7c7f4e42fee819 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Mon, 19 Jan 2026 09:59:55 -0500 Subject: [PATCH 15/38] book: improves tests --- pkg/vere/book_tests.c | 3596 +++++++++++++++-------------------------- 1 file changed, 1283 insertions(+), 2313 deletions(-) diff --git a/pkg/vere/book_tests.c b/pkg/vere/book_tests.c index 06a8a8e888..75271a4d56 100644 --- a/pkg/vere/book_tests.c +++ b/pkg/vere/book_tests.c @@ -1,2686 +1,1656 @@ -/// @file - #include "db/book.h" -#include "vere.h" -#include -#include -#include -#include #include +#include +#include +#include #include +#include +#include -/* test helpers -*/ +#define _alloc(sz) malloc(sz) +#define _free(ptr) free(ptr) -/* _test_tmpdir(): create temporary test directory. +/* _test_make_tmpdir(): create unique temporary directory. 
+** +** returns: heap-allocated path (caller must free) */ static c3_c* -_test_tmpdir(const c3_c* prefix) +_test_make_tmpdir(void) { - c3_c* tmp_c = c3_malloc(256); - snprintf(tmp_c, 256, "/tmp/%s-XXXXXX", prefix); + c3_c pat_c[] = "/tmp/book_test_XXXXXX"; + c3_c* dir_c = mkdtemp(pat_c); - if ( !mkdtemp(tmp_c) ) { - fprintf(stderr, "book_tests: failed to create temp dir\r\n"); - c3_free(tmp_c); + if ( !dir_c ) { + fprintf(stderr, "book_test: mkdtemp failed: %s\r\n", strerror(errno)); return 0; } - return tmp_c; + c3_c* ret_c = _alloc(strlen(dir_c) + 1); + strcpy(ret_c, dir_c); + return ret_c; } -/* _test_cleanup(): remove test directory and contents. +/* _test_rm_rf(): recursively remove directory contents. */ static void -_test_cleanup(const c3_c* dir_c) +_test_rm_rf(const c3_c* pax_c) { - c3_c cmd_c[512]; - snprintf(cmd_c, sizeof(cmd_c), "rm -rf %s", dir_c); + c3_c cmd_c[8192]; + snprintf(cmd_c, sizeof(cmd_c), "rm -rf %s", pax_c); system(cmd_c); } -/* _test_make_event(): create a fake event buffer (mug + jam data). -*/ -static void -_test_make_event(c3_y** buf_y, c3_z* siz_z, c3_d eve_d) -{ - // simple fake event: 4-byte mug + variable jam data - // mug = eve_d as 32-bit value - // jam = repeating pattern based on eve_d - - c3_w mug_w = (c3_w)eve_d; - c3_z jam_len = 16 + (eve_d % 32); // 16-48 bytes of jam data - - *siz_z = 4 + jam_len; - *buf_y = c3_malloc(*siz_z); - - memcpy(*buf_y, &mug_w, 4); - - // fill jam data with pattern - for ( c3_z i = 0; i < jam_len; i++ ) { - (*buf_y)[4 + i] = (c3_y)((eve_d + i) & 0xff); - } -} - -/* _test_verify_event(): verify event buffer matches expected. 
-*/ -static c3_o -_test_verify_event(c3_d eve_d, c3_z siz_z, void* buf_v) -{ - c3_y* buf_y = (c3_y*)buf_v; - c3_w mug_w; - c3_z expected_len; - - memcpy(&mug_w, buf_y, 4); - - if ( mug_w != (c3_w)eve_d ) { - fprintf(stderr, "book_tests: event %" PRIu64 " mug mismatch: got %u\r\n", eve_d, mug_w); - return c3n; - } - - expected_len = 16 + (eve_d % 32); - - if ( siz_z != 4 + expected_len ) { - fprintf(stderr, "book_tests: event %" PRIu64 " size mismatch: got %zu, expected %zu (4 + %zu)\r\n", - eve_d, siz_z, 4 + expected_len, expected_len); - return c3n; - } - - // verify jam data pattern - for ( c3_z i = 0; i < expected_len; i++ ) { - if ( buf_y[4 + i] != (c3_y)((eve_d + i) & 0xff) ) { - fprintf(stderr, "book_tests: event %" PRIu64 " data mismatch at offset %zu\r\n", - eve_d, i); - return c3n; - } - } - - return c3y; -} - -/* corruption test helpers -*/ - -/* _test_get_book_path(): build path to book.log file. -*/ -static void -_test_get_book_path(const c3_c* dir_c, c3_c* path_c, c3_z max_z) -{ - snprintf(path_c, max_z, "%s/book.log", dir_c); -} - -/* _test_get_file_size(): get size of book.log file. +/* _test_make_event(): create a test event buffer (mug + jam). +** +** creates a buffer with 4-byte mug followed by jam data. +** jam data is just the event number repeated. 
+** +** returns: heap-allocated buffer (caller must free) */ -static c3_o -_test_get_file_size(const c3_c* dir_c, c3_z* siz_z) +static c3_y* +_test_make_event(c3_z* len_z, c3_d eve_d) { - c3_c path_c[8193]; - struct stat st; + // create simple jam data: 8 bytes containing the event number + c3_z jam_z = 8; + c3_z tot_z = 4 + jam_z; // mug + jam + c3_y* buf_y = _alloc(tot_z); - _test_get_book_path(dir_c, path_c, sizeof(path_c)); + // mug: use event number as simple hash + c3_w mug_w = (c3_w)(eve_d * 0x12345678); + memcpy(buf_y, &mug_w, 4); - if ( 0 != stat(path_c, &st) ) { - fprintf(stderr, "book_tests: stat failed: %s\r\n", path_c); - return c3n; - } + // jam: event number as 8 bytes + memcpy(buf_y + 4, &eve_d, 8); - *siz_z = st.st_size; - return c3y; + *len_z = tot_z; + return buf_y; } -/* _test_calculate_event_offset(): calculate byte offset to specific event. +/* _test_corrupt_file(): flip a byte in a file at given offset. */ static c3_o -_test_calculate_event_offset(const c3_c* dir_c, c3_d target_eve, c3_w* off_w) +_test_corrupt_file(const c3_c* pax_c, c3_d off_d) { - c3_c path_c[8193]; - c3_i fid_i; - u3_book_head hed_u; - c3_d cur_d; - c3_w cur_off; - c3_zs ret_zs; - - _test_get_book_path(dir_c, path_c, sizeof(path_c)); - - fid_i = c3_open(path_c, O_RDONLY, 0); - if ( 0 > fid_i ) { - fprintf(stderr, "book_tests: open failed: %s\r\n", path_c); + c3_i fid_i = open(pax_c, O_RDWR); + if ( fid_i < 0 ) { return c3n; } - // read header - ret_zs = pread(fid_i, &hed_u, sizeof(u3_book_head), 0); - if ( sizeof(u3_book_head) != ret_zs ) { - fprintf(stderr, "book_tests: header read failed\r\n"); + c3_y byt_y; + if ( 1 != pread(fid_i, &byt_y, 1, off_d) ) { close(fid_i); return c3n; } - // allow target beyond current range (for corruption tests) - // just scan up to target or last event - c3_d scan_to = (target_eve <= hed_u.las_d) ? 
target_eve : hed_u.las_d + 1; - - // scan to target event - cur_off = 64 + 256; // sizeof(u3_book_head) + BOOK_META_SIZE - - for ( cur_d = hed_u.fir_d; cur_d < scan_to; cur_d++ ) { - u3_book_deed_head deed_hed; - - ret_zs = pread(fid_i, &deed_hed, sizeof(u3_book_deed_head), cur_off); - if ( sizeof(u3_book_deed_head) != ret_zs ) { - fprintf(stderr, "book_tests: deed header read failed at event %" PRIu64 " offset %u\r\n", - cur_d, cur_off); - close(fid_i); - return c3n; - } - - // total deed size = head(16 with padding) + (len_d - 4) + tail(16 with padding) - // = 16 + (len_d - 4) + 16 = len_d + 28 - cur_off += (deed_hed.len_d + 28); - } - - close(fid_i); - *off_w = cur_off; - return c3y; -} - -/* _test_corrupt_magic(): corrupt magic number in header. -*/ -static c3_o -_test_corrupt_magic(const c3_c* dir_c, c3_w bad_magic) -{ - c3_c path_c[8193]; - c3_i fid_i; - c3_zs ret_zs; - - _test_get_book_path(dir_c, path_c, sizeof(path_c)); - - fid_i = c3_open(path_c, O_RDWR, 0); - if ( 0 > fid_i ) { - fprintf(stderr, "book_tests: corrupt_magic open failed\r\n"); - return c3n; - } + byt_y ^= 0xFF; // flip all bits - ret_zs = pwrite(fid_i, &bad_magic, sizeof(c3_w), 0); - if ( sizeof(c3_w) != ret_zs ) { - fprintf(stderr, "book_tests: corrupt_magic write failed\r\n"); + if ( 1 != pwrite(fid_i, &byt_y, 1, off_d) ) { close(fid_i); return c3n; } - c3_sync(fid_i); close(fid_i); return c3y; } -/* _test_corrupt_version(): corrupt version in header. +/* _test_truncate_file(): truncate file to given size. 
*/ static c3_o -_test_corrupt_version(const c3_c* dir_c, c3_w bad_version) +_test_truncate_file(const c3_c* pax_c, c3_d siz_d) { - c3_c path_c[8193]; - c3_i fid_i; - c3_zs ret_zs; - - _test_get_book_path(dir_c, path_c, sizeof(path_c)); - - fid_i = c3_open(path_c, O_RDWR, 0); - if ( 0 > fid_i ) { - fprintf(stderr, "book_tests: corrupt_version open failed\r\n"); - return c3n; - } - - ret_zs = pwrite(fid_i, &bad_version, sizeof(c3_w), 4); // offset 4 - if ( sizeof(c3_w) != ret_zs ) { - fprintf(stderr, "book_tests: corrupt_version write failed\r\n"); - close(fid_i); + if ( -1 == truncate(pax_c, siz_d) ) { return c3n; } - - c3_sync(fid_i); - close(fid_i); return c3y; } -/* _test_corrupt_event_crc(): corrupt CRC of specific event. +/* _test_append_garbage(): append random bytes to file. */ static c3_o -_test_corrupt_event_crc(const c3_c* dir_c, c3_d eve_d) +_test_append_garbage(const c3_c* pax_c, c3_z len_z) { - c3_c path_c[8193]; - c3_i fid_i; - c3_w event_off, crc_off; - u3_book_deed_head deed_hed; - c3_w bad_crc = 0xDEADBEEF; - c3_zs ret_zs; - - // calculate offset to event - if ( c3n == _test_calculate_event_offset(dir_c, eve_d, &event_off) ) { - return c3n; - } - - _test_get_book_path(dir_c, path_c, sizeof(path_c)); - - fid_i = c3_open(path_c, O_RDWR, 0); - if ( 0 > fid_i ) { - fprintf(stderr, "book_tests: corrupt_event_crc open failed\r\n"); - return c3n; - } - - // read deed header to get len_d - ret_zs = pread(fid_i, &deed_hed, sizeof(u3_book_deed_head), event_off); - if ( sizeof(u3_book_deed_head) != ret_zs ) { - fprintf(stderr, "book_tests: corrupt_event_crc deed read failed\r\n"); - close(fid_i); + c3_i fid_i = open(pax_c, O_WRONLY | O_APPEND); + if ( fid_i < 0 ) { return c3n; } - // CRC offset = event_off + head(16 with padding) + (len_d - 4) - crc_off = event_off + 16 + (deed_hed.len_d - 4); - - ret_zs = pwrite(fid_i, &bad_crc, sizeof(c3_w), crc_off); - if ( sizeof(c3_w) != ret_zs ) { - fprintf(stderr, "book_tests: corrupt_event_crc write failed\r\n"); - 
close(fid_i); - return c3n; + c3_y* buf_y = _alloc(len_z); + for ( c3_z i = 0; i < len_z; i++ ) { + buf_y[i] = (c3_y)(i * 17 + 42); // pseudo-random } - c3_sync(fid_i); + c3_zs ret = write(fid_i, buf_y, len_z); + _free(buf_y); close(fid_i); - return c3y; + + return (ret == (c3_zs)len_z) ? c3y : c3n; } -/* _test_corrupt_event_length_tail(): corrupt let_d in event trailer. +/* _test_write_raw(): write raw bytes at offset in file. */ static c3_o -_test_corrupt_event_length_tail(const c3_c* dir_c, c3_d eve_d, c3_d bad_let_d) +_test_write_raw(const c3_c* pax_c, c3_d off_d, void* dat_v, c3_z len_z) { - c3_c path_c[8193]; - c3_i fid_i; - c3_w event_off, let_off; - u3_book_deed_head deed_hed; - c3_zs ret_zs; - - // calculate offset to event - if ( c3n == _test_calculate_event_offset(dir_c, eve_d, &event_off) ) { - return c3n; - } - - _test_get_book_path(dir_c, path_c, sizeof(path_c)); - - fid_i = c3_open(path_c, O_RDWR, 0); - if ( 0 > fid_i ) { - fprintf(stderr, "book_tests: corrupt_event_length open failed\r\n"); - return c3n; - } - - // read deed header to get len_d - ret_zs = pread(fid_i, &deed_hed, sizeof(u3_book_deed_head), event_off); - if ( sizeof(u3_book_deed_head) != ret_zs ) { - fprintf(stderr, "book_tests: corrupt_event_length deed read failed\r\n"); - close(fid_i); - return c3n; - } - - // let_d offset = event_off + head(16 with padding) + (len_d - 4) + crc_w(4) - let_off = event_off + 16 + (deed_hed.len_d - 4) + 4; - - ret_zs = pwrite(fid_i, &bad_let_d, sizeof(c3_d), let_off); - if ( sizeof(c3_d) != ret_zs ) { - fprintf(stderr, "book_tests: corrupt_event_length write failed\r\n"); - close(fid_i); + c3_i fid_i = open(pax_c, O_RDWR); + if ( fid_i < 0 ) { return c3n; } - c3_sync(fid_i); + c3_zs ret = pwrite(fid_i, dat_v, len_z, off_d); close(fid_i); - return c3y; + + return (ret == (c3_zs)len_z) ? c3y : c3n; } -/* _test_truncate_file(): truncate book.log to specific offset. +/* _test_file_size(): get file size. 
*/ -static c3_o -_test_truncate_file(const c3_c* dir_c, c3_w offset) +static c3_d +_test_file_size(const c3_c* pax_c) { - c3_c path_c[8193]; - - _test_get_book_path(dir_c, path_c, sizeof(path_c)); - - if ( 0 != truncate(path_c, offset) ) { - fprintf(stderr, "book_tests: truncate failed at offset %u\r\n", offset); - return c3n; + struct stat buf_u; + if ( -1 == stat(pax_c, &buf_u) ) { + return 0; } - - return c3y; + return (c3_d)buf_u.st_size; } -/* read callback context +/* _test_read_cb(): callback for u3_book_read that stores event data. */ -typedef struct _read_ctx { - c3_d count; - c3_d expected_start; - c3_o failed; -} read_ctx; +typedef struct { + c3_d eve_d; + c3_z len_z; + c3_y* buf_y; + c3_o called; +} _test_read_ctx; -/* _test_read_cb(): callback for u3_book_read(). -*/ static c3_o -_test_read_cb(void* ptr_v, c3_d eve_d, c3_z siz_z, void* buf_v) +_test_read_cb(void* ptr_v, c3_d eve_d, c3_z len_z, void* buf_v) { - read_ctx* ctx = (read_ctx*)ptr_v; - - if ( eve_d != ctx->expected_start + ctx->count ) { - fprintf(stderr, "book_tests: read callback event mismatch: %" PRIu64 " vs %" PRIu64 "\r\n", - eve_d, ctx->expected_start + ctx->count); - ctx->failed = c3y; - return c3n; - } - - if ( c3n == _test_verify_event(eve_d, siz_z, buf_v) ) { - ctx->failed = c3y; - return c3n; - } - - ctx->count++; + _test_read_ctx* ctx_u = ptr_v; + ctx_u->eve_d = eve_d; + ctx_u->len_z = len_z; + ctx_u->buf_y = _alloc(len_z); + ctx_u->called = c3y; + memcpy(ctx_u->buf_y, buf_v, len_z); return c3y; } -/* tests +/* _test_meta_cb(): callback for u3_book_read_meta. */ +typedef struct { + c3_zs siz_zs; + c3_y buf_y[256]; +} _test_meta_ctx; -/* _test_book_init_empty(): test creating new empty log. 
-*/ -static c3_o -_test_book_init_empty(void) +static void +_test_meta_cb(void* ptr_v, c3_zs siz_zs, void* dat_v) { - c3_c* tmp_c = _test_tmpdir("book-init"); - u3_book* log_u; - c3_d low_d, hig_d; - - if ( !tmp_c ) { - return c3n; - } - - // create new log - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - fprintf(stderr, "book_tests: init failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // verify empty gulf - if ( c3n == u3_book_gulf(log_u, &low_d, &hig_d) ) { - fprintf(stderr, "book_tests: gulf failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - if ( 0 != low_d || 0 != hig_d ) { - fprintf(stderr, "book_tests: empty gulf wrong: [%" PRIu64 ", %" PRIu64 "]\r\n", - low_d, hig_d); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + _test_meta_ctx* ctx_u = ptr_v; + ctx_u->siz_zs = siz_zs; + if ( siz_zs > 0 && dat_v ) { + memcpy(ctx_u->buf_y, dat_v, (c3_z)siz_zs); } - - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; } -/* _test_book_single_event(): test writing and reading single event. 
-*/ -static c3_o -_test_book_single_event(void) -{ - c3_c* tmp_c = _test_tmpdir("book-single"); - u3_book* log_u; - c3_y* buf_y; - c3_z siz_z; - c3_d low_d, hig_d; - read_ctx ctx = {0}; - - if ( !tmp_c ) { - return c3n; - } - - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // create and save event 1 - _test_make_event(&buf_y, &siz_z, 1); - - if ( c3n == u3_book_save(log_u, 1, 1, (void**)&buf_y, &siz_z, 0) ) { - fprintf(stderr, "book_tests: save failed\r\n"); - c3_free(buf_y); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - c3_free(buf_y); - - // verify gulf - u3_book_gulf(log_u, &low_d, &hig_d); - if ( 1 != low_d || 1 != hig_d ) { - fprintf(stderr, "book_tests: single gulf wrong: [%" PRIu64 ", %" PRIu64 "]\r\n", - low_d, hig_d); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // read event back - ctx.expected_start = 1; - ctx.count = 0; - ctx.failed = c3n; - if ( c3n == u3_book_read(log_u, &ctx, 1, 1, _test_read_cb) ) { - fprintf(stderr, "book_tests: read failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - if ( c3y == ctx.failed || 1 != ctx.count ) { - fprintf(stderr, "book_tests: read verify failed (failed=%u, count=%" PRIu64 ")\r\n", - ctx.failed, ctx.count); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; -} +//============================================================================== +// Boundary Condition Tests +//============================================================================== -/* _test_book_batch_write(): test writing batch of 100 events. +/* _test_empty_log_operations(): test operations on empty log. 
*/ -static c3_o -_test_book_batch_write(void) +static c3_i +_test_empty_log_operations(void) { - c3_c* tmp_c = _test_tmpdir("book-batch"); - u3_book* log_u; - void* bufs[100]; - c3_z sizes[100]; - c3_d i, low_d, hig_d; - read_ctx ctx = {0}; - - if ( !tmp_c ) { - return c3n; - } - - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // create 100 events - for ( i = 0; i < 100; i++ ) { - _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); - } - - // write batch - if ( c3n == u3_book_save(log_u, 1, 100, bufs, sizes, 0) ) { - fprintf(stderr, "book_tests: batch save failed\r\n"); - for ( i = 0; i < 100; i++ ) { - c3_free(bufs[i]); - } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // free buffers - for ( i = 0; i < 100; i++ ) { - c3_free(bufs[i]); - } - - // verify gulf - u3_book_gulf(log_u, &low_d, &hig_d); - if ( 1 != low_d || 100 != hig_d ) { - fprintf(stderr, "book_tests: batch gulf wrong: [%" PRIu64 ", %" PRIu64 "]\r\n", - low_d, hig_d); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; - // read all events back - ctx.expected_start = 1; - ctx.count = 0; - ctx.failed = c3n; - if ( c3n == u3_book_read(log_u, &ctx, 1, 100, _test_read_cb) ) { - fprintf(stderr, "book_tests: batch read failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); - if ( c3y == ctx.failed || 100 != ctx.count ) { - fprintf(stderr, "book_tests: batch read verify failed (failed=%u, count=%" PRIu64 ")\r\n", - ctx.failed, ctx.count); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + if ( !txt_u ) { + fprintf(stderr, " empty_log: init failed\r\n"); + ret_i = 0; + goto cleanup; } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; -} - -/* 
_test_book_persistence(): test closing and reopening log. -*/ -static c3_o -_test_book_persistence(void) -{ - c3_c* tmp_c = _test_tmpdir("book-persist"); - u3_book* log_u; - void* bufs[50]; - c3_z sizes[50]; - c3_d i, low_d, hig_d; - read_ctx ctx = {0}; - - if ( !tmp_c ) { - return c3n; - } + // test gulf on empty log + { + c3_d low_d, hig_d; + c3_o gul_o = u3_book_gulf(txt_u, &low_d, &hig_d); - // write 50 events - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + if ( c3y != gul_o ) { + fprintf(stderr, " empty_log: gulf returned c3n\r\n"); + ret_i = 0; + } + // empty log should have fir_d=0, las_d=0 + if ( 0 != low_d || 0 != hig_d ) { + fprintf(stderr, " empty_log: gulf expected (0,0), got (%" PRIu64 ",%" PRIu64 ")\r\n", + low_d, hig_d); + ret_i = 0; + } } - for ( i = 0; i < 50; i++ ) { - _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); - } + // test read on empty log - should fail + { + _test_read_ctx ctx_u = {0}; + c3_o red_o = u3_book_read(txt_u, &ctx_u, 1, 1, _test_read_cb); - if ( c3n == u3_book_save(log_u, 1, 50, bufs, sizes, 0) ) { - fprintf(stderr, "book_tests: persist save failed\r\n"); - for ( i = 0; i < 50; i++ ) { - c3_free(bufs[i]); + if ( c3n != red_o ) { + fprintf(stderr, " empty_log: read should fail on empty log\r\n"); + ret_i = 0; } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - for ( i = 0; i < 50; i++ ) { - c3_free(bufs[i]); } - u3_book_exit(log_u); + // test walk_init on empty log - should fail + { + u3_book_walk itr_u; + c3_o wlk_o = u3_book_walk_init(txt_u, &itr_u, 1, 1); - // reopen and verify - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - fprintf(stderr, "book_tests: persist reopen failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - u3_book_gulf(log_u, &low_d, &hig_d); - if ( 1 != low_d || 50 != hig_d ) { - fprintf(stderr, "book_tests: persist gulf wrong: [%" PRIu64 ", %" PRIu64 "]\r\n", - low_d, hig_d); - 
u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + if ( c3n != wlk_o ) { + fprintf(stderr, " empty_log: walk_init should fail on empty log\r\n"); + ret_i = 0; + } } - // read all events - ctx.expected_start = 1; - ctx.count = 0; - ctx.failed = c3n; - if ( c3n == u3_book_read(log_u, &ctx, 1, 50, _test_read_cb) ) { - fprintf(stderr, "book_tests: persist read failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + u3_book_exit(txt_u); - if ( c3y == ctx.failed || 50 != ctx.count ) { - fprintf(stderr, "book_tests: persist verify failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } +cleanup: + _test_rm_rf(dir_c); + _free(dir_c); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; + fprintf(stderr, " empty_log_operations: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; } -/* _test_book_contiguity(): test that non-contiguous writes fail. +/* _test_single_event_lifecycle(): write, read, walk single event. 
*/ -static c3_o -_test_book_contiguity(void) +static c3_i +_test_single_event_lifecycle(void) { - c3_c* tmp_c = _test_tmpdir("book-contig"); - u3_book* log_u; - c3_y* buf_y; - c3_z siz_z; - - if ( !tmp_c ) { - return c3n; - } - - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // write event 1 - _test_make_event(&buf_y, &siz_z, 1); - if ( c3n == u3_book_save(log_u, 1, 1, (void**)&buf_y, &siz_z, 0) ) { - fprintf(stderr, "book_tests: contig save 1 failed\r\n"); - c3_free(buf_y); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - c3_free(buf_y); - - // try to write event 3 (should fail - gap) - _test_make_event(&buf_y, &siz_z, 3); - if ( c3y == u3_book_save(log_u, 3, 1, (void**)&buf_y, &siz_z, 0) ) { - fprintf(stderr, "book_tests: contig should have failed for gap\r\n"); - c3_free(buf_y); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + c3_y* evt_y = 0; + c3_z evt_z; + + if ( !txt_u ) { + fprintf(stderr, " single_event: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // write single event (event #1, epoch 0) + evt_y = _test_make_event(&evt_z, 1); + { + void* byt_p[1] = { evt_y }; + c3_z siz_i[1] = { evt_z }; + + c3_o sav_o = u3_book_save(txt_u, 1, 1, byt_p, siz_i, 0); + if ( c3n == sav_o ) { + fprintf(stderr, " single_event: save failed\r\n"); + ret_i = 0; + goto cleanup; + } } - c3_free(buf_y); - // write event 2 (should succeed) - _test_make_event(&buf_y, &siz_z, 2); - if ( c3n == u3_book_save(log_u, 2, 1, (void**)&buf_y, &siz_z, 0) ) { - fprintf(stderr, "book_tests: contig save 2 failed\r\n"); - c3_free(buf_y); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + // verify gulf + { + c3_d low_d, hig_d; + u3_book_gulf(txt_u, &low_d, &hig_d); + + if ( 1 != low_d || 1 != hig_d ) { + 
fprintf(stderr, " single_event: gulf expected (1,1), got (%" PRIu64 ",%" PRIu64 ")\r\n", + low_d, hig_d); + ret_i = 0; + } } - c3_free(buf_y); - - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; -} -/* _test_book_partial_read(): test reading subset of events. -*/ -static c3_o -_test_book_partial_read(void) -{ - c3_c* tmp_c = _test_tmpdir("book-partial"); - u3_book* log_u; - void* bufs[100]; - c3_z sizes[100]; - c3_d i; - read_ctx ctx = {0}; - - if ( !tmp_c ) { - return c3n; - } + // read it back + { + _test_read_ctx ctx_u = {0}; + c3_o red_o = u3_book_read(txt_u, &ctx_u, 1, 1, _test_read_cb); - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + if ( c3n == red_o ) { + fprintf(stderr, " single_event: read failed\r\n"); + ret_i = 0; + } + else { + if ( ctx_u.eve_d != 1 ) { + fprintf(stderr, " single_event: read wrong event number\r\n"); + ret_i = 0; + } + if ( ctx_u.len_z != evt_z ) { + fprintf(stderr, " single_event: read wrong length\r\n"); + ret_i = 0; + } + if ( 0 != memcmp(ctx_u.buf_y, evt_y, evt_z) ) { + fprintf(stderr, " single_event: read data mismatch\r\n"); + ret_i = 0; + } + _free(ctx_u.buf_y); + } } - // write 100 events - for ( i = 0; i < 100; i++ ) { - _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); - } + // walk it + { + u3_book_walk itr_u; + c3_o wlk_o = u3_book_walk_init(txt_u, &itr_u, 1, 1); - if ( c3n == u3_book_save(log_u, 1, 100, bufs, sizes, 0) ) { - fprintf(stderr, "book_tests: partial save failed\r\n"); - for ( i = 0; i < 100; i++ ) { - c3_free(bufs[i]); + if ( c3n == wlk_o ) { + fprintf(stderr, " single_event: walk_init failed\r\n"); + ret_i = 0; } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + else { + c3_z len_z; + void* buf_v; + c3_o nex_o = u3_book_walk_next(&itr_u, &len_z, &buf_v); + + if ( c3n == nex_o ) { + fprintf(stderr, " single_event: walk_next failed\r\n"); + ret_i = 0; + } + else { + if ( len_z != evt_z ) 
{ + fprintf(stderr, " single_event: walk wrong length\r\n"); + ret_i = 0; + } + _free(buf_v); + + // second call should return c3n (end of iteration) + nex_o = u3_book_walk_next(&itr_u, &len_z, &buf_v); + if ( c3y == nex_o ) { + fprintf(stderr, " single_event: walk should end after 1 event\r\n"); + ret_i = 0; + _free(buf_v); + } + } - for ( i = 0; i < 100; i++ ) { - c3_free(bufs[i]); + u3_book_walk_done(&itr_u); + } } - // read events 50-75 - ctx.expected_start = 50; - ctx.count = 0; - ctx.failed = c3n; - if ( c3n == u3_book_read(log_u, &ctx, 50, 26, _test_read_cb) ) { - fprintf(stderr, "book_tests: partial read failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + u3_book_exit(txt_u); - if ( c3y == ctx.failed || 26 != ctx.count ) { - fprintf(stderr, "book_tests: partial verify failed: count=%" PRIu64 "\r\n", - ctx.count); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } +cleanup: + if ( evt_y ) _free(evt_y); + _test_rm_rf(dir_c); + _free(dir_c); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; + fprintf(stderr, " single_event_lifecycle: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; } -/* _test_book_iterator(): test walk iterator pattern. +/* _test_epoch_boundary_validation(): first event must be epo_d + 1. 
*/ -static c3_o -_test_book_iterator(void) +static c3_i +_test_epoch_boundary_validation(void) { - c3_c* tmp_c = _test_tmpdir("book-iter"); - u3_book* log_u; - u3_book_walk itr_u; - void* bufs[50]; - c3_z sizes[50]; - c3_d i; - c3_z len_z; - void* buf_v; - c3_d count = 0; - - if ( !tmp_c ) { - return c3n; - } - - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // write 50 events - for ( i = 0; i < 50; i++ ) { - _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); - } - - if ( c3n == u3_book_save(log_u, 1, 50, bufs, sizes, 0) ) { - fprintf(stderr, "book_tests: iter save failed\r\n"); - for ( i = 0; i < 50; i++ ) { - c3_free(bufs[i]); + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + c3_y* evt_y = 0; + c3_z evt_z; + + if ( !txt_u ) { + fprintf(stderr, " epoch_boundary: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + evt_y = _test_make_event(&evt_z, 5); + void* byt_p[1] = { evt_y }; + c3_z siz_i[1] = { evt_z }; + + // try to write event 5 with epoch 0 - should fail (expects event 1) + { + c3_o sav_o = u3_book_save(txt_u, 5, 1, byt_p, siz_i, 0); + if ( c3y == sav_o ) { + fprintf(stderr, " epoch_boundary: should reject event 5 for epoch 0\r\n"); + ret_i = 0; } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; } - for ( i = 0; i < 50; i++ ) { - c3_free(bufs[i]); + // write event 5 with epoch 4 - should succeed (4 + 1 = 5) + { + c3_o sav_o = u3_book_save(txt_u, 5, 1, byt_p, siz_i, 4); + if ( c3n == sav_o ) { + fprintf(stderr, " epoch_boundary: should accept event 5 for epoch 4\r\n"); + ret_i = 0; + } } - // iterate events 10-30 - if ( c3n == u3_book_walk_init(log_u, &itr_u, 10, 30) ) { - fprintf(stderr, "book_tests: walk_init failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - while ( c3y == u3_book_walk_next(&itr_u, &len_z, &buf_v) ) { - c3_d 
expected_eve = 10 + count; - - if ( c3n == _test_verify_event(expected_eve, len_z, buf_v) ) { - fprintf(stderr, "book_tests: iter verify failed at %" PRIu64 "\r\n", count); - c3_free(buf_v); - u3_book_walk_done(&itr_u); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - c3_free(buf_v); - count++; - } - - if ( 21 != count ) { // events 10-30 inclusive = 21 events - fprintf(stderr, "book_tests: iter count wrong: %" PRIu64 "\r\n", count); - u3_book_walk_done(&itr_u); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - u3_book_walk_done(&itr_u); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; -} - -/* metadata callback context -*/ -typedef struct _meta_ctx { - c3_o found; - c3_z len_z; - c3_y buf_y[256]; -} meta_ctx; - -/* _test_meta_cb(): callback for u3_book_read_meta(). -*/ -static void -_test_meta_cb(void* ptr_v, c3_zs len_zs, void* val_v) -{ - meta_ctx* ctx = (meta_ctx*)ptr_v; - - if ( len_zs < 0 ) { - ctx->found = c3n; - ctx->len_z = 0; - return; - } - - ctx->found = c3y; - ctx->len_z = len_zs; - if ( len_zs > 0 && len_zs <= 256 ) { - memcpy(ctx->buf_y, val_v, len_zs); - } -} - -/* _test_book_metadata(): test metadata read/write operations. 
-*/ -static c3_o -_test_book_metadata(void) -{ - c3_c* tmp_c = _test_tmpdir("book-meta"); - u3_book* log_u; - meta_ctx ctx = {0}; - c3_w version = 1; - c3_y who[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; - c3_o fake = c3y; - - if ( !tmp_c ) { - return c3n; - } - - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // write metadata - if ( c3n == u3_book_save_meta(log_u, "version", sizeof(version), &version) ) { - fprintf(stderr, "book_tests: meta save version failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - if ( c3n == u3_book_save_meta(log_u, "who", sizeof(who), who) ) { - fprintf(stderr, "book_tests: meta save who failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - if ( c3n == u3_book_save_meta(log_u, "fake", sizeof(fake), &fake) ) { - fprintf(stderr, "book_tests: meta save fake failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // read metadata back - ctx.found = c3n; - u3_book_read_meta(log_u, &ctx, "version", _test_meta_cb); - if ( c3n == ctx.found || ctx.len_z != sizeof(version) ) { - fprintf(stderr, "book_tests: meta read version failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - if ( memcmp(ctx.buf_y, &version, sizeof(version)) != 0 ) { - fprintf(stderr, "book_tests: meta version mismatch\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - ctx.found = c3n; - u3_book_read_meta(log_u, &ctx, "who", _test_meta_cb); - if ( c3n == ctx.found || ctx.len_z != sizeof(who) ) { - fprintf(stderr, "book_tests: meta read who failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - if ( memcmp(ctx.buf_y, who, sizeof(who)) != 0 ) { - fprintf(stderr, "book_tests: meta who mismatch\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); 
- c3_free(tmp_c); - return c3n; - } - - // read non-existent key - ctx.found = c3y; - u3_book_read_meta(log_u, &ctx, "nonexistent", _test_meta_cb); - if ( c3y == ctx.found ) { - fprintf(stderr, "book_tests: meta read nonexistent should fail\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // update existing key - version = 2; - if ( c3n == u3_book_save_meta(log_u, "version", sizeof(version), &version) ) { - fprintf(stderr, "book_tests: meta update version failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - ctx.found = c3n; - u3_book_read_meta(log_u, &ctx, "version", _test_meta_cb); - if ( c3n == ctx.found || ctx.len_z != sizeof(version) ) { - fprintf(stderr, "book_tests: meta read updated version failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - if ( memcmp(ctx.buf_y, &version, sizeof(version)) != 0 ) { - fprintf(stderr, "book_tests: meta updated version mismatch\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; -} - -/* failure mode tests -*/ - -/* _test_book_corrupt_header_magic(): test invalid magic number detection. 
-*/ -static c3_o -_test_book_corrupt_header_magic(void) -{ - c3_c* tmp_c = _test_tmpdir("book-corrupt-magic"); - u3_book* log_u; - void* bufs[10]; - c3_z sizes[10]; - c3_d i; - - if ( !tmp_c ) { - return c3n; - } - - // create log with 10 events - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - for ( i = 0; i < 10; i++ ) { - _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); - } - - if ( c3n == u3_book_save(log_u, 1, 10, bufs, sizes, 0) ) { - fprintf(stderr, "book_tests: corrupt_header_magic save failed\r\n"); - for ( i = 0; i < 10; i++ ) { - c3_free(bufs[i]); - } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - for ( i = 0; i < 10; i++ ) { - c3_free(bufs[i]); - } - - u3_book_exit(log_u); - - // corrupt magic number - if ( c3n == _test_corrupt_magic(tmp_c, 0xDEADBEEF) ) { - fprintf(stderr, "book_tests: corrupt_header_magic corruption failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // try to reopen - should fail - log_u = u3_book_init(tmp_c); - if ( log_u ) { - fprintf(stderr, "book_tests: corrupt_header_magic should have failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; -} - -/* _test_book_corrupt_header_version(): test unsupported version detection. 
-*/ -static c3_o -_test_book_corrupt_header_version(void) -{ - c3_c* tmp_c = _test_tmpdir("book-corrupt-version"); - u3_book* log_u; - void* bufs[10]; - c3_z sizes[10]; - c3_d i; - - if ( !tmp_c ) { - return c3n; - } - - // create log with 10 events - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - for ( i = 0; i < 10; i++ ) { - _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); - } - - if ( c3n == u3_book_save(log_u, 1, 10, bufs, sizes, 0) ) { - fprintf(stderr, "book_tests: corrupt_header_version save failed\r\n"); - for ( i = 0; i < 10; i++ ) { - c3_free(bufs[i]); - } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - for ( i = 0; i < 10; i++ ) { - c3_free(bufs[i]); - } - - u3_book_exit(log_u); - - // corrupt version - if ( c3n == _test_corrupt_version(tmp_c, 99) ) { - fprintf(stderr, "book_tests: corrupt_header_version corruption failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // try to reopen - should fail - log_u = u3_book_init(tmp_c); - if ( log_u ) { - fprintf(stderr, "book_tests: corrupt_header_version should have failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; -} - -/* _test_book_corrupt_deed_crc(): test CRC corruption detection and recovery. 
-*/ -static c3_o -_test_book_corrupt_deed_crc(void) -{ - c3_c* tmp_c = _test_tmpdir("book-corrupt-crc"); - u3_book* log_u; - void* bufs[50]; - c3_z sizes[50]; - c3_d i, low_d, hig_d; - read_ctx ctx = {0}; - - if ( !tmp_c ) { - return c3n; - } - - // create log with 50 events - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - for ( i = 0; i < 50; i++ ) { - _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); - } - - if ( c3n == u3_book_save(log_u, 1, 50, bufs, sizes, 0) ) { - fprintf(stderr, "book_tests: corrupt_deed_crc save failed\r\n"); - for ( i = 0; i < 50; i++ ) { - c3_free(bufs[i]); - } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - for ( i = 0; i < 50; i++ ) { - c3_free(bufs[i]); - } - - u3_book_exit(log_u); - - // corrupt event 25's CRC - if ( c3n == _test_corrupt_event_crc(tmp_c, 25) ) { - fprintf(stderr, "book_tests: corrupt_deed_crc corruption failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // reopen - should succeed with recovery - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - fprintf(stderr, "book_tests: corrupt_deed_crc reopen failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // verify recovery truncated to event 24 - u3_book_gulf(log_u, &low_d, &hig_d); - if ( 1 != low_d || 24 != hig_d ) { - fprintf(stderr, "book_tests: corrupt_deed_crc gulf wrong: [%" PRIu64 ", %" PRIu64 "] expected [1, 24]\r\n", - low_d, hig_d); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // read events 1-24 should succeed - ctx.expected_start = 1; - ctx.count = 0; - ctx.failed = c3n; - if ( c3n == u3_book_read(log_u, &ctx, 1, 24, _test_read_cb) ) { - fprintf(stderr, "book_tests: corrupt_deed_crc read failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - if ( c3y == ctx.failed || 24 != ctx.count ) { - fprintf(stderr, "book_tests: 
corrupt_deed_crc read verify failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; -} - -/* _test_book_corrupt_deed_length_mismatch(): test len_d != let_d detection. -*/ -static c3_o -_test_book_corrupt_deed_length_mismatch(void) -{ - c3_c* tmp_c = _test_tmpdir("book-corrupt-length"); - u3_book* log_u; - void* bufs[30]; - c3_z sizes[30]; - c3_d i, low_d, hig_d; - - if ( !tmp_c ) { - return c3n; - } - - // create log with 30 events - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - for ( i = 0; i < 30; i++ ) { - _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); - } - - if ( c3n == u3_book_save(log_u, 1, 30, bufs, sizes, 0) ) { - fprintf(stderr, "book_tests: corrupt_deed_length save failed\r\n"); - for ( i = 0; i < 30; i++ ) { - c3_free(bufs[i]); - } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - for ( i = 0; i < 30; i++ ) { - c3_free(bufs[i]); - } - - u3_book_exit(log_u); - - // corrupt event 15's let_d field - if ( c3n == _test_corrupt_event_length_tail(tmp_c, 15, 99999) ) { - fprintf(stderr, "book_tests: corrupt_deed_length corruption failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // reopen with recovery - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - fprintf(stderr, "book_tests: corrupt_deed_length reopen failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // verify recovery truncated to event 14 - u3_book_gulf(log_u, &low_d, &hig_d); - if ( 1 != low_d || 14 != hig_d ) { - fprintf(stderr, "book_tests: corrupt_deed_length gulf wrong: [%" PRIu64 ", %" PRIu64 "] expected [1, 14]\r\n", - low_d, hig_d); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; -} - -/* 
_test_book_truncated_deed_partial(): test partial deed detection. -*/ -static c3_o -_test_book_truncated_deed_partial(void) -{ - c3_c* tmp_c = _test_tmpdir("book-truncated"); - u3_book* log_u; - void* bufs[20]; - c3_z sizes[20]; - c3_d i, low_d, hig_d; - c3_w event20_off; - - if ( !tmp_c ) { - return c3n; - } - - // create log with 20 events - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - for ( i = 0; i < 20; i++ ) { - _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); - } - - if ( c3n == u3_book_save(log_u, 1, 20, bufs, sizes, 0) ) { - fprintf(stderr, "book_tests: truncated_deed save failed\r\n"); - for ( i = 0; i < 20; i++ ) { - c3_free(bufs[i]); - } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - for ( i = 0; i < 20; i++ ) { - c3_free(bufs[i]); - } - - u3_book_exit(log_u); - - // calculate offset to event 20 - if ( c3n == _test_calculate_event_offset(tmp_c, 20, &event20_off) ) { - fprintf(stderr, "book_tests: truncated_deed offset calc failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // truncate in middle of event 20 - if ( c3n == _test_truncate_file(tmp_c, event20_off + 10) ) { - fprintf(stderr, "book_tests: truncated_deed truncate failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // reopen - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - fprintf(stderr, "book_tests: truncated_deed reopen failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // verify recovery removed partial event 20 - u3_book_gulf(log_u, &low_d, &hig_d); - if ( 1 != low_d || 19 != hig_d ) { - fprintf(stderr, "book_tests: truncated_deed gulf wrong: [%" PRIu64 ", %" PRIu64 "] expected [1, 19]\r\n", - low_d, hig_d); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; -} - -/* 
_test_book_multiple_corruptions(): verify recovery stops at first corruption. -*/ -static c3_o -_test_book_multiple_corruptions(void) -{ - c3_c* tmp_c = _test_tmpdir("book-multi-corrupt"); - u3_book* log_u; - void* bufs[100]; - c3_z sizes[100]; - c3_d i, low_d, hig_d; - - if ( !tmp_c ) { - return c3n; - } - - // create log with 100 events - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - for ( i = 0; i < 100; i++ ) { - _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); - } - - if ( c3n == u3_book_save(log_u, 1, 100, bufs, sizes, 0) ) { - fprintf(stderr, "book_tests: multi_corrupt save failed\r\n"); - for ( i = 0; i < 100; i++ ) { - c3_free(bufs[i]); - } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - for ( i = 0; i < 100; i++ ) { - c3_free(bufs[i]); - } - - u3_book_exit(log_u); - - // corrupt event 30's CRC - if ( c3n == _test_corrupt_event_crc(tmp_c, 30) ) { - fprintf(stderr, "book_tests: multi_corrupt first corruption failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // corrupt event 60's CRC - if ( c3n == _test_corrupt_event_crc(tmp_c, 60) ) { - fprintf(stderr, "book_tests: multi_corrupt second corruption failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // reopen - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - fprintf(stderr, "book_tests: multi_corrupt reopen failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // verify recovery stopped at first corruption (event 30) - u3_book_gulf(log_u, &low_d, &hig_d); - if ( 1 != low_d || 29 != hig_d ) { - fprintf(stderr, "book_tests: multi_corrupt gulf wrong: [%" PRIu64 ", %" PRIu64 "] expected [1, 29]\r\n", - low_d, hig_d); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; -} - -/* _test_book_corrupt_first_event(): 
corruption at first event empties log. -*/ -static c3_o -_test_book_corrupt_first_event(void) -{ - c3_c* tmp_c = _test_tmpdir("book-corrupt-first"); - u3_book* log_u; - void* bufs[50]; - c3_z sizes[50]; - c3_d i, low_d, hig_d; - - if ( !tmp_c ) { - return c3n; - } - - // create log with 50 events - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - for ( i = 0; i < 50; i++ ) { - _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); - } - - if ( c3n == u3_book_save(log_u, 1, 50, bufs, sizes, 0) ) { - fprintf(stderr, "book_tests: corrupt_first save failed\r\n"); - for ( i = 0; i < 50; i++ ) { - c3_free(bufs[i]); - } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - for ( i = 0; i < 50; i++ ) { - c3_free(bufs[i]); - } - - u3_book_exit(log_u); - - // corrupt event 1's CRC - if ( c3n == _test_corrupt_event_crc(tmp_c, 1) ) { - fprintf(stderr, "book_tests: corrupt_first corruption failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // reopen - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - fprintf(stderr, "book_tests: corrupt_first reopen failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - // verify log is empty - u3_book_gulf(log_u, &low_d, &hig_d); - if ( 0 != low_d || 0 != hig_d ) { - fprintf(stderr, "book_tests: corrupt_first gulf wrong: [%" PRIu64 ", %" PRIu64 "] expected [0, 0]\r\n", - low_d, hig_d); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; -} - -/* _test_book_file_too_small(): detect undersized file. 
-*/ -static c3_o -_test_book_file_too_small(void) -{ - c3_c* tmp_c = _test_tmpdir("book-too-small"); - c3_c path_c[8193]; - c3_i fid_i; - c3_y small_buf[32]; - u3_book* log_u; - - if ( !tmp_c ) { - return c3n; - } - - // manually create small file - _test_get_book_path(tmp_c, path_c, sizeof(path_c)); - - fid_i = c3_open(path_c, O_RDWR|O_CREAT, 0644); - if ( 0 > fid_i ) { - fprintf(stderr, "book_tests: file_too_small create failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - memset(small_buf, 0, sizeof(small_buf)); - if ( sizeof(small_buf) != write(fid_i, small_buf, sizeof(small_buf)) ) { - fprintf(stderr, "book_tests: file_too_small write failed\r\n"); - close(fid_i); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - close(fid_i); - - // try to init - should fail - log_u = u3_book_init(tmp_c); - if ( log_u ) { - fprintf(stderr, "book_tests: file_too_small should have failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + u3_book_exit(txt_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; -} +cleanup: + if ( evt_y ) _free(evt_y); + _test_rm_rf(dir_c); + _free(dir_c); -/* boundary condition tests -*/ + fprintf(stderr, " epoch_boundary_validation: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; +} -/* _test_book_read_empty_log(): test reading from empty log. +/* _test_contiguity_gap_rejection(): reject non-contiguous events. 
*/ -static c3_o -_test_book_read_empty_log(void) +static c3_i +_test_contiguity_gap_rejection(void) { - c3_c* tmp_c = _test_tmpdir("book-read-empty"); - u3_book* log_u; - read_ctx ctx = {0}; + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; - if ( !tmp_c ) { - return c3n; + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + c3_y* evt1_y = 0; + c3_y* evt3_y = 0; + c3_z evt_z; + + if ( !txt_u ) { + fprintf(stderr, " contiguity: init failed\r\n"); + ret_i = 0; + goto cleanup; } - // create empty log - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + // write event 1 + evt1_y = _test_make_event(&evt_z, 1); + { + void* byt_p[1] = { evt1_y }; + c3_z siz_i[1] = { evt_z }; + + c3_o sav_o = u3_book_save(txt_u, 1, 1, byt_p, siz_i, 0); + if ( c3n == sav_o ) { + fprintf(stderr, " contiguity: save event 1 failed\r\n"); + ret_i = 0; + goto cleanup; + } } - // try to read from empty log - should fail - ctx.expected_start = 1; - ctx.count = 0; - ctx.failed = c3n; - if ( c3y == u3_book_read(log_u, &ctx, 1, 1, _test_read_cb) ) { - fprintf(stderr, "book_tests: read_empty_log should have failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + // try to write event 3 (skipping 2) - should fail + evt3_y = _test_make_event(&evt_z, 3); + { + void* byt_p[1] = { evt3_y }; + c3_z siz_i[1] = { evt_z }; + + c3_o sav_o = u3_book_save(txt_u, 3, 1, byt_p, siz_i, 0); + if ( c3y == sav_o ) { + fprintf(stderr, " contiguity: should reject gap (event 3 after 1)\r\n"); + ret_i = 0; + } } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; + u3_book_exit(txt_u); + +cleanup: + if ( evt1_y ) _free(evt1_y); + if ( evt3_y ) _free(evt3_y); + _test_rm_rf(dir_c); + _free(dir_c); + + fprintf(stderr, " contiguity_gap_rejection: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; } -/* _test_book_read_beyond_range(): test reading beyond event range. 
+/* _test_minimum_event_size(): event with minimum size (just mug). */ -static c3_o -_test_book_read_beyond_range(void) +static c3_i +_test_minimum_event_size(void) { - c3_c* tmp_c = _test_tmpdir("book-read-beyond"); - u3_book* log_u; - void* bufs[10]; - c3_z sizes[10]; - c3_d i; - read_ctx ctx = {0}; - - if ( !tmp_c ) { - return c3n; - } + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; - // create log with events 1-10 - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); - for ( i = 0; i < 10; i++ ) { - _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); + if ( !txt_u ) { + fprintf(stderr, " min_event: init failed\r\n"); + ret_i = 0; + goto cleanup; } - if ( c3n == u3_book_save(log_u, 1, 10, bufs, sizes, 0) ) { - fprintf(stderr, "book_tests: read_beyond_range save failed\r\n"); - for ( i = 0; i < 10; i++ ) { - c3_free(bufs[i]); - } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + // create minimum event: just 4 bytes (mug only, no jam) + c3_y evt_y[4] = { 0xDE, 0xAD, 0xBE, 0xEF }; + void* byt_p[1] = { evt_y }; + c3_z siz_i[1] = { 4 }; - for ( i = 0; i < 10; i++ ) { - c3_free(bufs[i]); + c3_o sav_o = u3_book_save(txt_u, 1, 1, byt_p, siz_i, 0); + if ( c3n == sav_o ) { + fprintf(stderr, " min_event: save failed\r\n"); + ret_i = 0; + goto cleanup; } - // try to read event 11 - should fail - if ( c3y == u3_book_read(log_u, &ctx, 11, 1, _test_read_cb) ) { - fprintf(stderr, "book_tests: read_beyond_range event 11 should have failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + // read it back + { + _test_read_ctx ctx_u = {0}; + c3_o red_o = u3_book_read(txt_u, &ctx_u, 1, 1, _test_read_cb); - // try to read events 5-15 - should fail (extends beyond) - if ( c3y == u3_book_read(log_u, &ctx, 5, 11, _test_read_cb) ) { - fprintf(stderr, "book_tests: read_beyond_range 
events 5-15 should have failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + if ( c3n == red_o ) { + fprintf(stderr, " min_event: read failed\r\n"); + ret_i = 0; + } + else { + if ( ctx_u.len_z != 4 ) { + fprintf(stderr, " min_event: wrong length %" PRIu64 "\r\n", (c3_d)ctx_u.len_z); + ret_i = 0; + } + _free(ctx_u.buf_y); + } } - // try to read event 0 - should fail (before first) - if ( c3y == u3_book_read(log_u, &ctx, 0, 1, _test_read_cb) ) { - fprintf(stderr, "book_tests: read_beyond_range event 0 should have failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + u3_book_exit(txt_u); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; +cleanup: + _test_rm_rf(dir_c); + _free(dir_c); + + fprintf(stderr, " minimum_event_size: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; } -/* _test_book_iterator_invalid_ranges(): test iterator with invalid ranges. +//============================================================================== +// Crash Recovery & Corruption Tests +//============================================================================== + +/* _test_crc_corruption_detection(): flip bit in data, verify recovery truncates. +** +** This test verifies that CRC corruption is detected during recovery. +** After corrupting jam data and reopening, the log should be empty +** because the corrupted deed fails CRC validation. 
*/ -static c3_o -_test_book_iterator_invalid_ranges(void) +static c3_i +_test_crc_corruption_detection(void) { - c3_c* tmp_c = _test_tmpdir("book-iter-invalid"); - u3_book* log_u; - u3_book_walk itr_u; - void* bufs[50]; - c3_z sizes[50]; - c3_d i; - - if ( !tmp_c ) { - return c3n; + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + c3_y* evt_y = 0; + c3_z evt_z; + c3_c path_c[8192]; + + if ( !txt_u ) { + fprintf(stderr, " crc_corruption: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // write event (evt_z = 12 bytes: 4 mug + 8 jam) + evt_y = _test_make_event(&evt_z, 1); + { + void* byt_p[1] = { evt_y }; + c3_z siz_i[1] = { evt_z }; + + c3_o sav_o = u3_book_save(txt_u, 1, 1, byt_p, siz_i, 0); + if ( c3n == sav_o ) { + fprintf(stderr, " crc_corruption: save failed\r\n"); + ret_i = 0; + goto cleanup; + } } - // create log with events 1-50 - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + u3_book_exit(txt_u); + txt_u = 0; - for ( i = 0; i < 50; i++ ) { - _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); + // corrupt the CRC field directly to ensure CRC mismatch + // file layout: [header 16] [deed_head 12] [jam 8] [deed_tail 12] + // deed_tail: [crc_w 4] [let_d 8] + // crc_w is at offset: 16 + 12 + 8 = 36 + snprintf(path_c, sizeof(path_c), "%s/book.log", dir_c); + if ( c3n == _test_corrupt_file(path_c, 36) ) { + fprintf(stderr, " crc_corruption: corrupt_file failed\r\n"); + ret_i = 0; + goto cleanup; } - if ( c3n == u3_book_save(log_u, 1, 50, bufs, sizes, 0) ) { - fprintf(stderr, "book_tests: iter_invalid_ranges save failed\r\n"); - for ( i = 0; i < 50; i++ ) { - c3_free(bufs[i]); - } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + // reopen - recovery should detect CRC mismatch and truncate + txt_u = u3_book_init(dir_c); + if ( !txt_u ) { + fprintf(stderr, " crc_corruption: reopen 
failed\r\n"); + ret_i = 0; + goto cleanup; } - for ( i = 0; i < 50; i++ ) { - c3_free(bufs[i]); - } + // after recovery, log should be empty (corrupted deed truncated) + { + c3_d low_d, hig_d; + u3_book_gulf(txt_u, &low_d, &hig_d); - // try iterator [60, 70] - should fail (beyond range) - if ( c3y == u3_book_walk_init(log_u, &itr_u, 60, 70) ) { - fprintf(stderr, "book_tests: iter_invalid_ranges [60, 70] should have failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + if ( hig_d != 0 ) { + fprintf(stderr, " crc_corruption: expected empty log after recovery, got hig=%" PRIu64 "\r\n", hig_d); + ret_i = 0; + } } - // try iterator [40, 30] - should fail (start > end) - if ( c3y == u3_book_walk_init(log_u, &itr_u, 40, 30) ) { - fprintf(stderr, "book_tests: iter_invalid_ranges [40, 30] should have failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + u3_book_exit(txt_u); + txt_u = 0; - // try iterator [0, 10] - should fail (before first) - if ( c3y == u3_book_walk_init(log_u, &itr_u, 0, 10) ) { - fprintf(stderr, "book_tests: iter_invalid_ranges [0, 10] should have failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } +cleanup: + if ( txt_u ) u3_book_exit(txt_u); + if ( evt_y ) _free(evt_y); + _test_rm_rf(dir_c); + _free(dir_c); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; + fprintf(stderr, " crc_corruption_detection: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; } -/* _test_book_write_first_wrong_epoch(): test first event must be epo_d + 1. +/* _test_truncated_file_recovery(): truncate mid-event, verify recovery. +** +** write two events, truncate file mid-second-event, reopen. +** recovery should find only the first complete event. 
*/ -static c3_o -_test_book_write_first_wrong_epoch(void) +static c3_i +_test_truncated_file_recovery(void) { - c3_c* tmp_c = _test_tmpdir("book-wrong-epoch"); - u3_book* log_u; - c3_y* buf_y; - c3_z siz_z; - - if ( !tmp_c ) { - return c3n; + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + c3_y* evt1_y = 0; + c3_y* evt2_y = 0; + c3_z evt_z; + c3_c path_c[8192]; + + if ( !txt_u ) { + fprintf(stderr, " truncated_file: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // write two events (each evt_z = 12 bytes: 4 mug + 8 jam) + // deed size on disk = 12 (head) + 8 (jam) + 12 (tail) = 32 bytes + evt1_y = _test_make_event(&evt_z, 1); + evt2_y = _test_make_event(&evt_z, 2); + { + void* byt_p[2] = { evt1_y, evt2_y }; + c3_z siz_i[2] = { evt_z, evt_z }; + + c3_o sav_o = u3_book_save(txt_u, 1, 2, byt_p, siz_i, 0); + if ( c3n == sav_o ) { + fprintf(stderr, " truncated_file: save failed\r\n"); + ret_i = 0; + goto cleanup; + } } - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + u3_book_exit(txt_u); + txt_u = 0; - // try to save event 5 with epo_d=0 - should fail (expected event 1) - _test_make_event(&buf_y, &siz_z, 5); - if ( c3y == u3_book_save(log_u, 5, 1, (void**)&buf_y, &siz_z, 0) ) { - fprintf(stderr, "book_tests: wrong_epoch event 5 with epo 0 should have failed\r\n"); - c3_free(buf_y); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + // file layout: [header 16] [deed1] [deed2] + // deed size = sizeof(deed_head) + (len_d - 4) + sizeof(deed_tail) + // with struct padding, this is typically 40 bytes per deed for our 12-byte events + snprintf(path_c, sizeof(path_c), "%s/book.log", dir_c); + c3_d siz_d = _test_file_size(path_c); + + // calculate deed size dynamically: total - header = 2 deeds + c3_d deed_size = (siz_d - 16) / 2; + + // truncate to: header + deed1 + 5 bytes of deed2 + c3_d truncate_at = 
16 + deed_size + 5; + + if ( c3n == _test_truncate_file(path_c, truncate_at) ) { + fprintf(stderr, " truncated_file: truncate failed\r\n"); + ret_i = 0; + goto cleanup; } - c3_free(buf_y); - // try to save event 1 with epo_d=5 - should fail (expected event 6) - _test_make_event(&buf_y, &siz_z, 1); - if ( c3y == u3_book_save(log_u, 1, 1, (void**)&buf_y, &siz_z, 5) ) { - fprintf(stderr, "book_tests: wrong_epoch event 1 with epo 5 should have failed\r\n"); - c3_free(buf_y); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + // reopen - recovery should find deed1 valid, deed2 truncated + txt_u = u3_book_init(dir_c); + if ( !txt_u ) { + fprintf(stderr, " truncated_file: reopen failed\r\n"); + ret_i = 0; + goto cleanup; } - c3_free(buf_y); - // save event 1 with epo_d=0 - should succeed - _test_make_event(&buf_y, &siz_z, 1); - if ( c3n == u3_book_save(log_u, 1, 1, (void**)&buf_y, &siz_z, 0) ) { - fprintf(stderr, "book_tests: wrong_epoch event 1 with epo 0 failed\r\n"); - c3_free(buf_y); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + // verify only event 1 exists + { + c3_d low_d, hig_d; + u3_book_gulf(txt_u, &low_d, &hig_d); + + if ( hig_d != 1 ) { + fprintf(stderr, " truncated_file: expected hig=1, got %" PRIu64 "\r\n", hig_d); + ret_i = 0; + } } - c3_free(buf_y); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; + u3_book_exit(txt_u); + txt_u = 0; + +cleanup: + if ( txt_u ) u3_book_exit(txt_u); + if ( evt1_y ) _free(evt1_y); + if ( evt2_y ) _free(evt2_y); + _test_rm_rf(dir_c); + _free(dir_c); + + fprintf(stderr, " truncated_file_recovery: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; } -/* _test_large_event_cb(): callback for large event test. +/* _test_garbage_after_valid_deeds(): append garbage, verify recovery stops. +** +** write a valid event, then append garbage bytes that form an invalid +** deed structure. 
recovery should preserve the valid event and truncate +** the garbage. +** +** note: we append a small, controlled garbage pattern to avoid triggering +** huge allocation attempts from random let_d values. */ -static c3_o -_test_large_event_cb(void* ptr_v, c3_d eve_d, c3_z siz_z, void* buf_v) +static c3_i +_test_garbage_after_valid_deeds(void) { - c3_z* expected_size = (c3_z*)ptr_v; - - if ( 1 != eve_d ) { - fprintf(stderr, "book_tests: large_event_cb wrong event: %" PRIu64 "\r\n", eve_d); - return c3n; + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + c3_y* evt_y = 0; + c3_z evt_z; + c3_c path_c[8192]; + + if ( !txt_u ) { + fprintf(stderr, " garbage_after: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // write one valid event + evt_y = _test_make_event(&evt_z, 1); + { + void* byt_p[1] = { evt_y }; + c3_z siz_i[1] = { evt_z }; + + c3_o sav_o = u3_book_save(txt_u, 1, 1, byt_p, siz_i, 0); + if ( c3n == sav_o ) { + fprintf(stderr, " garbage_after: save failed\r\n"); + ret_i = 0; + goto cleanup; + } } - if ( *expected_size != siz_z ) { - fprintf(stderr, "book_tests: large_event_cb size mismatch: %zu vs %zu\r\n", - siz_z, *expected_size); - return c3n; - } + u3_book_exit(txt_u); + txt_u = 0; - return c3y; -} + // append garbage with a zero let_d trailer to prevent huge allocations + // the reverse scan reads let_d from the last 8 bytes; if let_d == 0, + // scan_back breaks and falls through to scan_end + snprintf(path_c, sizeof(path_c), "%s/book.log", dir_c); + { + c3_i fid_i = open(path_c, O_WRONLY | O_APPEND); + if ( fid_i < 0 ) { + fprintf(stderr, " garbage_after: open failed\r\n"); + ret_i = 0; + goto cleanup; + } -/* _test_book_very_large_event(): test large event handling. 
-*/ -static c3_o -_test_book_very_large_event(void) -{ - c3_c* tmp_c = _test_tmpdir("book-large-event"); - u3_book* log_u; - c3_y* buf_y; - c3_z siz_z; - c3_z large_size = 1024 * 1024; // 1 MB event - c3_w mug_w = 12345; - c3_z i; - - if ( !tmp_c ) { - return c3n; + // 12 bytes of garbage that won't form valid let_d + // set last 8 bytes to 0 so let_d == 0 triggers scan_back failure + c3_y garbage[12] = { 0xDE, 0xAD, 0xBE, 0xEF, 0, 0, 0, 0, 0, 0, 0, 0 }; + write(fid_i, garbage, sizeof(garbage)); + close(fid_i); } - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + // reopen - should recover to just event 1 + txt_u = u3_book_init(dir_c); + if ( !txt_u ) { + fprintf(stderr, " garbage_after: reopen failed\r\n"); + ret_i = 0; + goto cleanup; } - // create large event: 4-byte mug + (large_size - 4) jam data - siz_z = large_size; - buf_y = c3_malloc(siz_z); + // verify event 1 is still readable + { + c3_d low_d, hig_d; + u3_book_gulf(txt_u, &low_d, &hig_d); - memcpy(buf_y, &mug_w, 4); - for ( i = 4; i < siz_z; i++ ) { - buf_y[i] = (c3_y)(i & 0xff); + if ( hig_d != 1 ) { + fprintf(stderr, " garbage_after: expected hig=1, got %" PRIu64 "\r\n", hig_d); + ret_i = 0; + } } - // save large event - if ( c3n == u3_book_save(log_u, 1, 1, (void**)&buf_y, &siz_z, 0) ) { - fprintf(stderr, "book_tests: very_large_event save failed\r\n"); - c3_free(buf_y); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + // read should succeed + { + _test_read_ctx ctx_u = {0}; + c3_o red_o = u3_book_read(txt_u, &ctx_u, 1, 1, _test_read_cb); + + if ( c3n == red_o ) { + fprintf(stderr, " garbage_after: read failed\r\n"); + ret_i = 0; + } + else { + _free(ctx_u.buf_y); + } } - c3_free(buf_y); + u3_book_exit(txt_u); + txt_u = 0; - // read back and verify size matches - if ( c3n == u3_book_read(log_u, &large_size, 1, 1, _test_large_event_cb) ) { - fprintf(stderr, "book_tests: very_large_event read failed\r\n"); - 
u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } +cleanup: + if ( txt_u ) u3_book_exit(txt_u); + if ( evt_y ) _free(evt_y); + _test_rm_rf(dir_c); + _free(dir_c); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; + fprintf(stderr, " garbage_after_valid_deeds: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; } -/* metadata edge case tests -*/ - -/* _test_book_metadata_section_full(): test 256-byte metadata limit. +/* _test_length_trailer_mismatch(): craft deed with len_d != let_d. */ -static c3_o -_test_book_metadata_section_full(void) +static c3_i +_test_length_trailer_mismatch(void) { - c3_c* tmp_c = _test_tmpdir("book-meta-full"); - u3_book* log_u; - c3_y data[4]; - - if ( !tmp_c ) { - return c3n; + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + c3_y* evt_y = 0; + c3_z evt_z; + c3_c path_c[8192]; + + if ( !txt_u ) { + fprintf(stderr, " len_mismatch: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // write event + evt_y = _test_make_event(&evt_z, 1); + { + void* byt_p[1] = { evt_y }; + c3_z siz_i[1] = { evt_z }; + + c3_o sav_o = u3_book_save(txt_u, 1, 1, byt_p, siz_i, 0); + if ( c3n == sav_o ) { + fprintf(stderr, " len_mismatch: save failed\r\n"); + ret_i = 0; + goto cleanup; + } } - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + u3_book_exit(txt_u); + txt_u = 0; + + // corrupt the let_d field (last 8 bytes of deed) + // deed ends at: 16 (header) + 12 (deed_head) + (evt_z-4) (jam) + 12 (deed_tail) + // let_d is at offset: deed_end - 8 + snprintf(path_c, sizeof(path_c), "%s/book.log", dir_c); + c3_d deed_end = 16 + 12 + (evt_z - 4) + 12; + c3_d let_off = deed_end - 8; + + // write a different value for let_d + c3_d bad_let = 0x12345678; + if ( c3n == _test_write_raw(path_c, let_off, &bad_let, sizeof(bad_let)) ) { + fprintf(stderr, " len_mismatch: write_raw 
failed\r\n"); + ret_i = 0; + goto cleanup; } - // try to save the four fixed keys we support - memset(data, 0xAB, sizeof(data)); - - // version (4 bytes) - if ( c3n == u3_book_save_meta(log_u, "version", 4, data) ) { - fprintf(stderr, "book_tests: meta_section_full version save failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + // reopen - should recover to empty (no valid events) + txt_u = u3_book_init(dir_c); + if ( !txt_u ) { + fprintf(stderr, " len_mismatch: reopen failed\r\n"); + ret_i = 0; + goto cleanup; } - // unknown key should fail - if ( c3y == u3_book_save_meta(log_u, "unknown", 4, data) ) { - fprintf(stderr, "book_tests: meta_section_full unknown key should have failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + // verify no events (mismatch detected, truncated) + { + c3_d low_d, hig_d; + u3_book_gulf(txt_u, &low_d, &hig_d); + + if ( hig_d != 0 ) { + fprintf(stderr, " len_mismatch: expected empty log, got hig=%" PRIu64 "\r\n", hig_d); + ret_i = 0; + } } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; + u3_book_exit(txt_u); + txt_u = 0; + +cleanup: + if ( txt_u ) u3_book_exit(txt_u); + if ( evt_y ) _free(evt_y); + _test_rm_rf(dir_c); + _free(dir_c); + + fprintf(stderr, " length_trailer_mismatch: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; } -/* _test_book_metadata_corrupted_count(): test corrupted metadata handling. +//============================================================================== +// Iterator Tests +//============================================================================== + +/* _test_walk_single_event(): walk range of exactly 1 event. 
*/ -static c3_o -_test_book_metadata_corrupted_count(void) +static c3_i +_test_walk_single_event(void) { - c3_c* tmp_c = _test_tmpdir("book-meta-corrupt"); - u3_book* log_u; - c3_w version = 1; - void* bufs[10]; - c3_z sizes[10]; - c3_d i, low_d, hig_d; - c3_c path_c[8193]; - c3_i fid_i; - c3_w bad_count = 999; - - if ( !tmp_c ) { - return c3n; - } + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; - // create log with metadata and events - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + c3_y* evt_y[3] = {0}; + c3_z evt_z; - // add metadata - if ( c3n == u3_book_save_meta(log_u, "version", sizeof(version), &version) ) { - fprintf(stderr, "book_tests: meta_corrupted save_meta failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + if ( !txt_u ) { + fprintf(stderr, " walk_single: init failed\r\n"); + ret_i = 0; + goto cleanup; } - // add events - for ( i = 0; i < 10; i++ ) { - _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); - } + // write 3 events + { + void* byt_p[3]; + c3_z siz_i[3]; - if ( c3n == u3_book_save(log_u, 1, 10, bufs, sizes, 0) ) { - fprintf(stderr, "book_tests: meta_corrupted save failed\r\n"); - for ( i = 0; i < 10; i++ ) { - c3_free(bufs[i]); + for ( int i = 0; i < 3; i++ ) { + evt_y[i] = _test_make_event(&evt_z, i + 1); + byt_p[i] = evt_y[i]; + siz_i[i] = evt_z; } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - for ( i = 0; i < 10; i++ ) { - c3_free(bufs[i]); + c3_o sav_o = u3_book_save(txt_u, 1, 3, byt_p, siz_i, 0); + if ( c3n == sav_o ) { + fprintf(stderr, " walk_single: save failed\r\n"); + ret_i = 0; + goto cleanup; + } } - u3_book_exit(log_u); + // walk just event 2 + { + u3_book_walk itr_u; + c3_o wlk_o = u3_book_walk_init(txt_u, &itr_u, 2, 2); - // corrupt metadata count field (at offset 64) - _test_get_book_path(tmp_c, path_c, 
sizeof(path_c)); - fid_i = c3_open(path_c, O_RDWR, 0); - if ( 0 > fid_i ) { - fprintf(stderr, "book_tests: meta_corrupted open failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + if ( c3n == wlk_o ) { + fprintf(stderr, " walk_single: walk_init failed\r\n"); + ret_i = 0; + } + else { + c3_z len_z; + void* buf_v; + c3_i count = 0; - pwrite(fid_i, &bad_count, sizeof(c3_w), 64); - c3_sync(fid_i); - close(fid_i); + while ( c3y == u3_book_walk_next(&itr_u, &len_z, &buf_v) ) { + count++; + _free(buf_v); + } - // reopen - should succeed (metadata corruption shouldn't prevent init) - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - fprintf(stderr, "book_tests: meta_corrupted reopen failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + if ( count != 1 ) { + fprintf(stderr, " walk_single: expected 1 event, got %d\r\n", count); + ret_i = 0; + } + + u3_book_walk_done(&itr_u); + } } - // events should still be readable - u3_book_gulf(log_u, &low_d, &hig_d); - if ( 1 != low_d || 10 != hig_d ) { - fprintf(stderr, "book_tests: meta_corrupted events lost\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + u3_book_exit(txt_u); + +cleanup: + for ( int i = 0; i < 3; i++ ) { + if ( evt_y[i] ) _free(evt_y[i]); } + _test_rm_rf(dir_c); + _free(dir_c); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; + fprintf(stderr, " walk_single_event: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; } -/* _test_book_metadata_empty_key(): test unknown key edge case. +/* _test_walk_invalidation(): walk_done then walk_next should fail. 
*/ -static c3_o -_test_book_metadata_empty_key(void) +static c3_i +_test_walk_invalidation(void) { - c3_c* tmp_c = _test_tmpdir("book-meta-empty"); - u3_book* log_u; - c3_w val = 42; - - if ( !tmp_c ) { - return c3n; + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + c3_y* evt_y = 0; + c3_z evt_z; + + if ( !txt_u ) { + fprintf(stderr, " walk_invalid: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // write event + evt_y = _test_make_event(&evt_z, 1); + { + void* byt_p[1] = { evt_y }; + c3_z siz_i[1] = { evt_z }; + + c3_o sav_o = u3_book_save(txt_u, 1, 1, byt_p, siz_i, 0); + if ( c3n == sav_o ) { + fprintf(stderr, " walk_invalid: save failed\r\n"); + ret_i = 0; + goto cleanup; + } } - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + // init walk, then done, then try next + { + u3_book_walk itr_u; + c3_o wlk_o = u3_book_walk_init(txt_u, &itr_u, 1, 1); + + if ( c3n == wlk_o ) { + fprintf(stderr, " walk_invalid: walk_init failed\r\n"); + ret_i = 0; + } + else { + u3_book_walk_done(&itr_u); + + c3_z len_z; + void* buf_v; + c3_o nex_o = u3_book_walk_next(&itr_u, &len_z, &buf_v); - // try to save with unknown key - should fail - if ( c3n == u3_book_save_meta(log_u, "unknown_key", sizeof(val), &val) ) { - // unknown key rejected - expected behavior - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; + if ( c3y == nex_o ) { + fprintf(stderr, " walk_invalid: walk_next should fail after done\r\n"); + ret_i = 0; + _free(buf_v); + } + } } - // unknown key accepted - that's ok, just verify it doesn't crash - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; + u3_book_exit(txt_u); + +cleanup: + if ( evt_y ) _free(evt_y); + _test_rm_rf(dir_c); + _free(dir_c); + + fprintf(stderr, " walk_invalidation: %s\r\n", ret_i ? 
"ok" : "FAILED"); + return ret_i; } - -/* _test_book_metadata_persistence(): test metadata survives corruption recovery. + +/* _test_walk_range_validation(): invalid ranges should fail. */ -static c3_o -_test_book_metadata_persistence(void) +static c3_i +_test_walk_range_validation(void) { - c3_c* tmp_c = _test_tmpdir("book-meta-persist"); - u3_book* log_u; - c3_w version = 1; - void* bufs[20]; - c3_z sizes[20]; - c3_d i, low_d, hig_d; - meta_ctx ctx = {0}; - - if ( !tmp_c ) { - return c3n; - } + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; - // create log with metadata - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + c3_y* evt_y[3] = {0}; + c3_z evt_z; - if ( c3n == u3_book_save_meta(log_u, "version", sizeof(version), &version) ) { - fprintf(stderr, "book_tests: meta_persistence save_meta failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + if ( !txt_u ) { + fprintf(stderr, " walk_range: init failed\r\n"); + ret_i = 0; + goto cleanup; } - // add events - for ( i = 0; i < 20; i++ ) { - _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); - } + // write 3 events + { + void* byt_p[3]; + c3_z siz_i[3]; - if ( c3n == u3_book_save(log_u, 1, 20, bufs, sizes, 0) ) { - fprintf(stderr, "book_tests: meta_persistence save failed\r\n"); - for ( i = 0; i < 20; i++ ) { - c3_free(bufs[i]); + for ( int i = 0; i < 3; i++ ) { + evt_y[i] = _test_make_event(&evt_z, i + 1); + byt_p[i] = evt_y[i]; + siz_i[i] = evt_z; } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - for ( i = 0; i < 20; i++ ) { - c3_free(bufs[i]); + c3_o sav_o = u3_book_save(txt_u, 1, 3, byt_p, siz_i, 0); + if ( c3n == sav_o ) { + fprintf(stderr, " walk_range: save failed\r\n"); + ret_i = 0; + goto cleanup; + } } - u3_book_exit(log_u); + // try invalid ranges + { + u3_book_walk itr_u; - // corrupt last event 
- if ( c3n == _test_corrupt_event_crc(tmp_c, 20) ) { - fprintf(stderr, "book_tests: meta_persistence corruption failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + // nex > las (inverted range) + if ( c3y == u3_book_walk_init(txt_u, &itr_u, 3, 1) ) { + fprintf(stderr, " walk_range: should reject nex > las\r\n"); + ret_i = 0; + } - // reopen with recovery - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - fprintf(stderr, "book_tests: meta_persistence reopen failed\r\n"); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + // las beyond log end + if ( c3y == u3_book_walk_init(txt_u, &itr_u, 1, 100) ) { + fprintf(stderr, " walk_range: should reject las > log end\r\n"); + ret_i = 0; + } - // verify recovery happened - u3_book_gulf(log_u, &low_d, &hig_d); - if ( 19 != hig_d ) { - fprintf(stderr, "book_tests: meta_persistence recovery didn't happen\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + // nex before log start (fir_d is 1) + if ( c3y == u3_book_walk_init(txt_u, &itr_u, 0, 1) ) { + fprintf(stderr, " walk_range: should reject nex < fir_d\r\n"); + ret_i = 0; + } } - // verify metadata still readable - u3_book_read_meta(log_u, &ctx, "version", _test_meta_cb); - if ( c3n == ctx.found ) { - fprintf(stderr, "book_tests: meta_persistence metadata lost\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + u3_book_exit(txt_u); + +cleanup: + for ( int i = 0; i < 3; i++ ) { + if ( evt_y[i] ) _free(evt_y[i]); } + _test_rm_rf(dir_c); + _free(dir_c); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; + fprintf(stderr, " walk_range_validation: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; } -/* invalid operation tests +/* _test_invalid_magic(): file with wrong magic number should be rejected. */ - -/* _test_book_null_handle(): test NULL handle checks. 
-*/ -static c3_o -_test_book_null_handle(void) +static c3_i +_test_invalid_magic(void) { - c3_d low_d, hig_d; - read_ctx ctx = {0}; - c3_y* buf_y; - c3_z siz_z; + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; - // test gulf with NULL - if ( c3y == u3_book_gulf(NULL, &low_d, &hig_d) ) { - fprintf(stderr, "book_tests: null_handle gulf should have failed\r\n"); - return c3n; + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + c3_c path_c[8192]; + + if ( !txt_u ) { + fprintf(stderr, " invalid_magic: init failed\r\n"); + ret_i = 0; + goto cleanup; } - // test read with NULL - if ( c3y == u3_book_read(NULL, &ctx, 1, 1, _test_read_cb) ) { - fprintf(stderr, "book_tests: null_handle read should have failed\r\n"); - return c3n; + u3_book_exit(txt_u); + txt_u = 0; + + // corrupt magic number at offset 0 + snprintf(path_c, sizeof(path_c), "%s/book.log", dir_c); + c3_w bad_magic = 0xDEADBEEF; + if ( c3n == _test_write_raw(path_c, 0, &bad_magic, sizeof(bad_magic)) ) { + fprintf(stderr, " invalid_magic: write_raw failed\r\n"); + ret_i = 0; + goto cleanup; } - // test save with NULL - _test_make_event(&buf_y, &siz_z, 1); - if ( c3y == u3_book_save(NULL, 1, 1, (void**)&buf_y, &siz_z, 0) ) { - fprintf(stderr, "book_tests: null_handle save should have failed\r\n"); - c3_free(buf_y); - return c3n; + // reopen should fail + txt_u = u3_book_init(dir_c); + if ( txt_u ) { + fprintf(stderr, " invalid_magic: should reject bad magic\r\n"); + ret_i = 0; + u3_book_exit(txt_u); + txt_u = 0; } - c3_free(buf_y); - return c3y; +cleanup: + if ( txt_u ) u3_book_exit(txt_u); + _test_rm_rf(dir_c); + _free(dir_c); + + fprintf(stderr, " invalid_magic: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; } -/* _test_book_iterator_after_done(): test closed iterator. +/* _test_invalid_version(): file with wrong version should be rejected. 
*/ -static c3_o -_test_book_iterator_after_done(void) +static c3_i +_test_invalid_version(void) { - c3_c* tmp_c = _test_tmpdir("book-iter-done"); - u3_book* log_u; - u3_book_walk itr_u; - void* bufs[20]; - c3_z sizes[20]; - c3_d i; - c3_z len_z; - void* buf_v; - - if ( !tmp_c ) { - return c3n; - } + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; - // create log with events 1-20 - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + c3_c path_c[8192]; - for ( i = 0; i < 20; i++ ) { - _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); + if ( !txt_u ) { + fprintf(stderr, " invalid_version: init failed\r\n"); + ret_i = 0; + goto cleanup; } - if ( c3n == u3_book_save(log_u, 1, 20, bufs, sizes, 0) ) { - fprintf(stderr, "book_tests: iter_after_done save failed\r\n"); - for ( i = 0; i < 20; i++ ) { - c3_free(bufs[i]); - } - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } + u3_book_exit(txt_u); + txt_u = 0; - for ( i = 0; i < 20; i++ ) { - c3_free(bufs[i]); + // corrupt version at offset 4 + snprintf(path_c, sizeof(path_c), "%s/book.log", dir_c); + c3_w bad_version = 99; + if ( c3n == _test_write_raw(path_c, 4, &bad_version, sizeof(bad_version)) ) { + fprintf(stderr, " invalid_version: write_raw failed\r\n"); + ret_i = 0; + goto cleanup; } - // create iterator - if ( c3n == u3_book_walk_init(log_u, &itr_u, 1, 20) ) { - fprintf(stderr, "book_tests: iter_after_done walk_init failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + // reopen should fail + txt_u = u3_book_init(dir_c); + if ( txt_u ) { + fprintf(stderr, " invalid_version: should reject bad version\r\n"); + ret_i = 0; + u3_book_exit(txt_u); + txt_u = 0; } - // close iterator - u3_book_walk_done(&itr_u); +cleanup: + if ( txt_u ) u3_book_exit(txt_u); + _test_rm_rf(dir_c); + _free(dir_c); - // try to use closed 
iterator - should fail - if ( c3y == u3_book_walk_next(&itr_u, &len_z, &buf_v) ) { - fprintf(stderr, "book_tests: iter_after_done walk_next should have failed\r\n"); - c3_free(buf_v); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; + fprintf(stderr, " invalid_version: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; } -/* _test_book_iterator_concurrent_modification(): test iterator after log modification. +/* _test_header_only_file(): file with just header should init as empty. */ -static c3_o -_test_book_iterator_concurrent_modification(void) +static c3_i +_test_header_only_file(void) { - c3_c* tmp_c = _test_tmpdir("book-iter-concurrent"); - u3_book* log_u; - u3_book_walk itr_u; - void* bufs[70]; - c3_z sizes[70]; - c3_d i; - c3_z len_z; - void* buf_v; - c3_d count = 0; - - if ( !tmp_c ) { - return c3n; - } - - // create log with events 1-50 - log_u = u3_book_init(tmp_c); - if ( !log_u ) { - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - - for ( i = 0; i < 50; i++ ) { - _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); - } - - if ( c3n == u3_book_save(log_u, 1, 50, bufs, sizes, 0) ) { - fprintf(stderr, "book_tests: iter_concurrent save failed\r\n"); - for ( i = 0; i < 50; i++ ) { - c3_free(bufs[i]); + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + c3_y* evt_y = 0; + c3_z evt_z; + c3_c path_c[8192]; + + if ( !txt_u ) { + fprintf(stderr, " header_only: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // write event then truncate to header only + evt_y = _test_make_event(&evt_z, 1); + { + void* byt_p[1] = { evt_y }; + c3_z siz_i[1] = { evt_z }; + + c3_o sav_o = u3_book_save(txt_u, 1, 1, byt_p, siz_i, 0); + if ( c3n == sav_o ) { + fprintf(stderr, " header_only: save failed\r\n"); + ret_i = 0; + goto cleanup; } - u3_book_exit(log_u); - 
_test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; } - for ( i = 0; i < 50; i++ ) { - c3_free(bufs[i]); - } + u3_book_exit(txt_u); + txt_u = 0; - // create iterator for events 10-30 - if ( c3n == u3_book_walk_init(log_u, &itr_u, 10, 30) ) { - fprintf(stderr, "book_tests: iter_concurrent walk_init failed\r\n"); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; + // truncate to just header (16 bytes) + snprintf(path_c, sizeof(path_c), "%s/book.log", dir_c); + if ( c3n == _test_truncate_file(path_c, 16) ) { + fprintf(stderr, " header_only: truncate failed\r\n"); + ret_i = 0; + goto cleanup; } - // read a few events - for ( count = 0; count < 5; count++ ) { - if ( c3n == u3_book_walk_next(&itr_u, &len_z, &buf_v) ) { - fprintf(stderr, "book_tests: iter_concurrent walk_next failed\r\n"); - u3_book_walk_done(&itr_u); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } - c3_free(buf_v); + // reopen should succeed with empty log + txt_u = u3_book_init(dir_c); + if ( !txt_u ) { + fprintf(stderr, " header_only: reopen failed\r\n"); + ret_i = 0; + goto cleanup; } - // add new events 51-60 - for ( i = 50; i < 60; i++ ) { - _test_make_event((c3_y**)&bufs[i], &sizes[i], i + 1); - } + // verify empty + { + c3_d low_d, hig_d; + u3_book_gulf(txt_u, &low_d, &hig_d); - if ( c3n == u3_book_save(log_u, 51, 10, &bufs[50], &sizes[50], 0) ) { - fprintf(stderr, "book_tests: iter_concurrent second save failed\r\n"); - for ( i = 50; i < 60; i++ ) { - c3_free(bufs[i]); + if ( hig_d != 0 ) { + fprintf(stderr, " header_only: expected empty, got hig=%" PRIu64 "\r\n", hig_d); + ret_i = 0; } - u3_book_walk_done(&itr_u); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; } - for ( i = 50; i < 60; i++ ) { - c3_free(bufs[i]); - } - - // continue iterating - should continue with original range - while ( c3y == u3_book_walk_next(&itr_u, &len_z, &buf_v) ) { - c3_free(buf_v); - count++; - } + u3_book_exit(txt_u); + 
txt_u = 0; - // verify we read the expected range (10-30 = 21 events, already read 5) - if ( 21 != count ) { - fprintf(stderr, "book_tests: iter_concurrent count wrong: %" PRIu64 "\r\n", count); - u3_book_walk_done(&itr_u); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3n; - } +cleanup: + if ( txt_u ) u3_book_exit(txt_u); + if ( evt_y ) _free(evt_y); + _test_rm_rf(dir_c); + _free(dir_c); - u3_book_walk_done(&itr_u); - u3_book_exit(log_u); - _test_cleanup(tmp_c); - c3_free(tmp_c); - return c3y; + fprintf(stderr, " header_only_file: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; } -/* _test_book_core(): run all core book tests. +/* _test_undersized_file(): file smaller than header should be rejected. */ -static c3_o -_test_book_core(void) +static c3_i +_test_undersized_file(void) { - c3_o ret = c3y; + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; - if ( c3n == _test_book_init_empty() ) { - fprintf(stderr, "book_tests: init_empty failed\r\n"); - ret = c3n; - } + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + c3_c path_c[8192]; - if ( c3n == _test_book_single_event() ) { - fprintf(stderr, "book_tests: single_event failed\r\n"); - ret = c3n; + if ( !txt_u ) { + fprintf(stderr, " undersized: init failed\r\n"); + ret_i = 0; + goto cleanup; } - if ( c3n == _test_book_batch_write() ) { - fprintf(stderr, "book_tests: batch_write failed\r\n"); - ret = c3n; - } + u3_book_exit(txt_u); + txt_u = 0; - if ( c3n == _test_book_persistence() ) { - fprintf(stderr, "book_tests: persistence failed\r\n"); - ret = c3n; + // truncate to 8 bytes (less than 16-byte header) + snprintf(path_c, sizeof(path_c), "%s/book.log", dir_c); + if ( c3n == _test_truncate_file(path_c, 8) ) { + fprintf(stderr, " undersized: truncate failed\r\n"); + ret_i = 0; + goto cleanup; } - if ( c3n == _test_book_contiguity() ) { - fprintf(stderr, "book_tests: contiguity failed\r\n"); - ret = c3n; + // reopen should fail + txt_u = u3_book_init(dir_c); + if ( 
txt_u ) { + fprintf(stderr, " undersized: should reject undersized file\r\n"); + ret_i = 0; + u3_book_exit(txt_u); + txt_u = 0; } - if ( c3n == _test_book_partial_read() ) { - fprintf(stderr, "book_tests: partial_read failed\r\n"); - ret = c3n; - } +cleanup: + if ( txt_u ) u3_book_exit(txt_u); + _test_rm_rf(dir_c); + _free(dir_c); - if ( c3n == _test_book_iterator() ) { - fprintf(stderr, "book_tests: iterator failed\r\n"); - ret = c3n; - } + fprintf(stderr, " undersized_file: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; +} - if ( c3n == _test_book_metadata() ) { - fprintf(stderr, "book_tests: metadata failed\r\n"); - ret = c3n; - } +/* _test_metadata_roundtrip(): save and read all metadata fields. +*/ +static c3_i +_test_metadata_roundtrip(void) +{ + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; - // file corruption tests - if ( c3n == _test_book_corrupt_header_magic() ) { - fprintf(stderr, "book_tests: corrupt_header_magic failed\r\n"); - ret = c3n; - } + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); - if ( c3n == _test_book_corrupt_header_version() ) { - fprintf(stderr, "book_tests: corrupt_header_version failed\r\n"); - ret = c3n; + if ( !txt_u ) { + fprintf(stderr, " meta_roundtrip: init failed\r\n"); + ret_i = 0; + goto cleanup; } - if ( c3n == _test_book_corrupt_deed_crc() ) { - fprintf(stderr, "book_tests: corrupt_deed_crc failed\r\n"); - ret = c3n; + // test "version" field + { + c3_w ver_w = 42; + c3_o sav_o = u3_book_save_meta(txt_u, "version", sizeof(c3_w), &ver_w); + if ( c3n == sav_o ) { + fprintf(stderr, " meta_roundtrip: save version failed\r\n"); + ret_i = 0; + } + else { + _test_meta_ctx ctx_u = {0}; + u3_book_read_meta(txt_u, &ctx_u, "version", _test_meta_cb); + if ( ctx_u.siz_zs != sizeof(c3_w) ) { + fprintf(stderr, " meta_roundtrip: read version wrong size\r\n"); + ret_i = 0; + } + else { + c3_w got_w; + memcpy(&got_w, ctx_u.buf_y, sizeof(c3_w)); + if ( got_w != 42 ) { + fprintf(stderr, " meta_roundtrip: version 
mismatch\r\n"); + ret_i = 0; + } + } + } } - if ( c3n == _test_book_corrupt_deed_length_mismatch() ) { - fprintf(stderr, "book_tests: corrupt_deed_length_mismatch failed\r\n"); - ret = c3n; + // test "who" field (16 bytes) + { + c3_d who_d[2] = { 0x123456789ABCDEF0, 0xFEDCBA9876543210 }; + c3_o sav_o = u3_book_save_meta(txt_u, "who", sizeof(who_d), who_d); + if ( c3n == sav_o ) { + fprintf(stderr, " meta_roundtrip: save who failed\r\n"); + ret_i = 0; + } + else { + _test_meta_ctx ctx_u = {0}; + u3_book_read_meta(txt_u, &ctx_u, "who", _test_meta_cb); + if ( ctx_u.siz_zs != sizeof(who_d) ) { + fprintf(stderr, " meta_roundtrip: read who wrong size\r\n"); + ret_i = 0; + } + else { + c3_d got_d[2]; + memcpy(got_d, ctx_u.buf_y, sizeof(got_d)); + if ( got_d[0] != who_d[0] || got_d[1] != who_d[1] ) { + fprintf(stderr, " meta_roundtrip: who mismatch\r\n"); + ret_i = 0; + } + } + } } - if ( c3n == _test_book_truncated_deed_partial() ) { - fprintf(stderr, "book_tests: truncated_deed_partial failed\r\n"); - ret = c3n; + // test "fake" field (1 byte) + { + c3_o fak_o = c3y; + c3_o sav_o = u3_book_save_meta(txt_u, "fake", sizeof(c3_o), &fak_o); + if ( c3n == sav_o ) { + fprintf(stderr, " meta_roundtrip: save fake failed\r\n"); + ret_i = 0; + } + else { + _test_meta_ctx ctx_u = {0}; + u3_book_read_meta(txt_u, &ctx_u, "fake", _test_meta_cb); + if ( ctx_u.siz_zs != sizeof(c3_o) ) { + fprintf(stderr, " meta_roundtrip: read fake wrong size\r\n"); + ret_i = 0; + } + else { + c3_o got_o; + memcpy(&got_o, ctx_u.buf_y, sizeof(c3_o)); + if ( got_o != c3y ) { + fprintf(stderr, " meta_roundtrip: fake mismatch\r\n"); + ret_i = 0; + } + } + } } - if ( c3n == _test_book_multiple_corruptions() ) { - fprintf(stderr, "book_tests: multiple_corruptions failed\r\n"); - ret = c3n; + // test "life" field + { + c3_w lif_w = 1234; + c3_o sav_o = u3_book_save_meta(txt_u, "life", sizeof(c3_w), &lif_w); + if ( c3n == sav_o ) { + fprintf(stderr, " meta_roundtrip: save life failed\r\n"); + ret_i = 0; + } + 
else { + _test_meta_ctx ctx_u = {0}; + u3_book_read_meta(txt_u, &ctx_u, "life", _test_meta_cb); + if ( ctx_u.siz_zs != sizeof(c3_w) ) { + fprintf(stderr, " meta_roundtrip: read life wrong size\r\n"); + ret_i = 0; + } + else { + c3_w got_w; + memcpy(&got_w, ctx_u.buf_y, sizeof(c3_w)); + if ( got_w != 1234 ) { + fprintf(stderr, " meta_roundtrip: life mismatch\r\n"); + ret_i = 0; + } + } + } } - if ( c3n == _test_book_corrupt_first_event() ) { - fprintf(stderr, "book_tests: corrupt_first_event failed\r\n"); - ret = c3n; - } + u3_book_exit(txt_u); - if ( c3n == _test_book_file_too_small() ) { - fprintf(stderr, "book_tests: file_too_small failed\r\n"); - ret = c3n; - } +cleanup: + _test_rm_rf(dir_c); + _free(dir_c); - // boundary condition tests - if ( c3n == _test_book_read_empty_log() ) { - fprintf(stderr, "book_tests: read_empty_log failed\r\n"); - ret = c3n; - } + fprintf(stderr, " metadata_roundtrip: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; +} - if ( c3n == _test_book_read_beyond_range() ) { - fprintf(stderr, "book_tests: read_beyond_range failed\r\n"); - ret = c3n; - } +/* _test_metadata_invalid_key(): unknown key should return -1. 
+*/ +static c3_i +_test_metadata_invalid_key(void) +{ + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; - if ( c3n == _test_book_iterator_invalid_ranges() ) { - fprintf(stderr, "book_tests: iterator_invalid_ranges failed\r\n"); - ret = c3n; - } + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); - if ( c3n == _test_book_write_first_wrong_epoch() ) { - fprintf(stderr, "book_tests: write_first_wrong_epoch failed\r\n"); - ret = c3n; + if ( !txt_u ) { + fprintf(stderr, " meta_invalid: init failed\r\n"); + ret_i = 0; + goto cleanup; } - if ( c3n == _test_book_very_large_event() ) { - fprintf(stderr, "book_tests: very_large_event failed\r\n"); - ret = c3n; + // read unknown key + { + _test_meta_ctx ctx_u = {0}; + u3_book_read_meta(txt_u, &ctx_u, "nonexistent", _test_meta_cb); + if ( ctx_u.siz_zs != -1 ) { + fprintf(stderr, " meta_invalid: should return -1 for unknown key\r\n"); + ret_i = 0; + } } - // metadata edge case tests - if ( c3n == _test_book_metadata_section_full() ) { - fprintf(stderr, "book_tests: metadata_section_full failed\r\n"); - ret = c3n; + // write unknown key + { + c3_w val_w = 42; + c3_o sav_o = u3_book_save_meta(txt_u, "nonexistent", sizeof(val_w), &val_w); + if ( c3y == sav_o ) { + fprintf(stderr, " meta_invalid: should reject unknown key\r\n"); + ret_i = 0; + } } - if ( c3n == _test_book_metadata_corrupted_count() ) { - fprintf(stderr, "book_tests: metadata_corrupted_count failed\r\n"); - ret = c3n; - } + u3_book_exit(txt_u); - if ( c3n == _test_book_metadata_empty_key() ) { - fprintf(stderr, "book_tests: metadata_empty_key failed\r\n"); - ret = c3n; - } +cleanup: + _test_rm_rf(dir_c); + _free(dir_c); - if ( c3n == _test_book_metadata_persistence() ) { - fprintf(stderr, "book_tests: metadata_persistence failed\r\n"); - ret = c3n; + fprintf(stderr, " metadata_invalid_key: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; +} + +/* _test_metadata_size_validation(): wrong-sized values should be rejected. 
+*/ +static c3_i +_test_metadata_size_validation(void) +{ + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + + if ( !txt_u ) { + fprintf(stderr, " meta_size: init failed\r\n"); + ret_i = 0; + goto cleanup; } - // invalid operation tests - if ( c3n == _test_book_null_handle() ) { - fprintf(stderr, "book_tests: null_handle failed\r\n"); - ret = c3n; + // try to write 2 bytes to "version" (expects 4) + { + c3_y buf_y[2] = { 0x12, 0x34 }; + c3_o sav_o = u3_book_save_meta(txt_u, "version", 2, buf_y); + if ( c3y == sav_o ) { + fprintf(stderr, " meta_size: should reject wrong size for version\r\n"); + ret_i = 0; + } } - if ( c3n == _test_book_iterator_after_done() ) { - fprintf(stderr, "book_tests: iterator_after_done failed\r\n"); - ret = c3n; + // try to write 4 bytes to "who" (expects 16) + { + c3_w val_w = 42; + c3_o sav_o = u3_book_save_meta(txt_u, "who", sizeof(val_w), &val_w); + if ( c3y == sav_o ) { + fprintf(stderr, " meta_size: should reject wrong size for who\r\n"); + ret_i = 0; + } } - if ( c3n == _test_book_iterator_concurrent_modification() ) { - fprintf(stderr, "book_tests: iterator_concurrent_modification failed\r\n"); - ret = c3n; + // try to write 4 bytes to "fake" (expects 1) + { + c3_w val_w = 1; + c3_o sav_o = u3_book_save_meta(txt_u, "fake", sizeof(val_w), &val_w); + if ( c3y == sav_o ) { + fprintf(stderr, " meta_size: should reject wrong size for fake\r\n"); + ret_i = 0; + } } - return ret; + u3_book_exit(txt_u); + +cleanup: + _test_rm_rf(dir_c); + _free(dir_c); + + fprintf(stderr, " metadata_size_validation: %s\r\n", ret_i ? 
"ok" : "FAILED"); + return ret_i; } -/* main -*/ +//============================================================================== +// Main +//============================================================================== + int main(int argc, char* argv[]) { - if ( c3n == _test_book_core() ) { - fprintf(stderr, "book tests failed\r\n"); + c3_i ret_i = 1; + + // boundary tests + ret_i &= _test_empty_log_operations(); + ret_i &= _test_single_event_lifecycle(); + ret_i &= _test_epoch_boundary_validation(); + ret_i &= _test_contiguity_gap_rejection(); + ret_i &= _test_minimum_event_size(); + + // crash recovery & corruption tests + ret_i &= _test_crc_corruption_detection(); + ret_i &= _test_truncated_file_recovery(); + ret_i &= _test_garbage_after_valid_deeds(); + ret_i &= _test_length_trailer_mismatch(); + + // iterator tests + ret_i &= _test_walk_single_event(); + ret_i &= _test_walk_invalidation(); + ret_i &= _test_walk_range_validation(); + + // header & format tests + ret_i &= _test_invalid_magic(); + ret_i &= _test_invalid_version(); + ret_i &= _test_header_only_file(); + ret_i &= _test_undersized_file(); + + // metadata tests + ret_i &= _test_metadata_roundtrip(); + ret_i &= _test_metadata_invalid_key(); + ret_i &= _test_metadata_size_validation(); + + fprintf(stderr, "\r\n"); + if ( ret_i ) { + fprintf(stderr, "book_tests: ok\n"); + return 0; + } + else { + fprintf(stderr, "book_tests: failed\n"); return 1; } - - fprintf(stderr, "test book: ok\n"); - return 0; } From 89704e1ebc86363b2028c777bade8e54408ff41b Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Mon, 19 Jan 2026 13:57:29 -0500 Subject: [PATCH 16/38] wip: uses lmdb for top-level metadata --- pkg/vere/disk.c | 123 +++++++++++++++++++++++++++++------------------- pkg/vere/mars.c | 9 ++-- pkg/vere/vere.h | 9 ++-- 3 files changed, 86 insertions(+), 55 deletions(-) diff --git a/pkg/vere/disk.c b/pkg/vere/disk.c index 3d58e1f963..98c2f8d919 100644 --- a/pkg/vere/disk.c +++ b/pkg/vere/disk.c @@ -5,6 +5,7 @@ 
#include "vere.h" #include "version.h" #include "db/book.h" +#include "db/lmdb.h" #include #include "migrate.h" @@ -461,27 +462,27 @@ u3_disk_walk_done(u3_disk_walk* wok_u) c3_free(wok_u); } -/* _disk_save_meta(): serialize atom, save as metadata at [key_c]. +/* _disk_save_meta(): save metadata field to LMDB. */ static c3_o -_disk_save_meta(u3_book* txt_u, const c3_c* key_c, c3_w len_w, c3_y* byt_y) +_disk_save_meta(MDB_env* mdb_u, const c3_c* key_c, c3_w len_w, c3_y* byt_y) { - return u3_book_save_meta(txt_u, key_c, len_w, byt_y); + return u3_lmdb_save_meta(mdb_u, key_c, len_w, byt_y); } -/* u3_disk_save_meta(): save metadata. +/* u3_disk_save_meta(): save metadata to LMDB. */ c3_o -u3_disk_save_meta(u3_book* txt_u, const u3_meta* met_u) +u3_disk_save_meta(MDB_env* mdb_u, const u3_meta* met_u) { u3_assert( c3y == u3a_is_cat(met_u->lif_w) ); u3_noun who = u3i_chubs(2, met_u->who_d); - if ( (c3n == _disk_save_meta(txt_u, "version", sizeof(c3_w), (c3_y*)&met_u->ver_w)) - || (c3n == _disk_save_meta(txt_u, "who", sizeof(met_u->who_d), (c3_y*)met_u->who_d)) - || (c3n == _disk_save_meta(txt_u, "fake", sizeof(c3_o), (c3_y*)&met_u->fak_o)) - || (c3n == _disk_save_meta(txt_u, "life", sizeof(c3_w), (c3_y*)&met_u->lif_w)) ) + if ( (c3n == _disk_save_meta(mdb_u, "version", sizeof(c3_w), (c3_y*)&met_u->ver_w)) + || (c3n == _disk_save_meta(mdb_u, "who", sizeof(met_u->who_d), (c3_y*)met_u->who_d)) + || (c3n == _disk_save_meta(mdb_u, "fake", sizeof(c3_o), (c3_y*)&met_u->fak_o)) + || (c3n == _disk_save_meta(mdb_u, "life", sizeof(c3_w), (c3_y*)&met_u->lif_w)) ) { u3z(who); return c3n; @@ -492,24 +493,25 @@ u3_disk_save_meta(u3_book* txt_u, const u3_meta* met_u) } -/* u3_disk_save_meta_meta(): save meta metadata. +/* u3_disk_save_meta_meta(): save meta metadata using LMDB. 
*/ c3_o u3_disk_save_meta_meta(c3_c* log_c, const u3_meta* met_u) { - u3_book* dbm_u; + MDB_env* mdb_u; - if ( 0 == (dbm_u = u3_book_init(log_c)) ) { - fprintf(stderr, "disk: failed to initialize meta-book\r\n"); + if ( 0 == (mdb_u = u3_lmdb_init(log_c, 1ULL << 30)) ) { + fprintf(stderr, "disk: failed to initialize lmdb for metadata\r\n"); return c3n; } - if ( c3n == u3_disk_save_meta(dbm_u, met_u) ) { + if ( c3n == u3_disk_save_meta(mdb_u, met_u) ) { fprintf(stderr, "disk: failed to save metadata\r\n"); + u3_lmdb_exit(mdb_u); return c3n; } - u3_book_exit(dbm_u); + u3_lmdb_exit(mdb_u); return c3y; } @@ -537,10 +539,10 @@ _disk_meta_read_cb(void* ptr_v, ssize_t val_i, void* val_v) } } -/* u3_disk_read_meta(): read metadata. +/* u3_disk_read_meta(): read metadata from LMDB. */ c3_o -u3_disk_read_meta(u3_book* txt_u, u3_meta* met_u) +u3_disk_read_meta(MDB_env* mdb_u, u3_meta* met_u) { c3_w ver_w, lif_w; c3_d who_d[2]; @@ -550,13 +552,13 @@ u3_disk_read_meta(u3_book* txt_u, u3_meta* met_u) // version // - u3_book_read_meta(txt_u, &val_u, "version", _disk_meta_read_cb); + u3_lmdb_read_meta(mdb_u, &val_u, "version", _disk_meta_read_cb); ver_w = val_u.buf_y[0]; // identity // - u3_book_read_meta(txt_u, &val_u, "who", _disk_meta_read_cb); + u3_lmdb_read_meta(mdb_u, &val_u, "who", _disk_meta_read_cb); if ( 0 > val_u.hav_i ) { fprintf(stderr, "disk: read meta: no identity\r\n"); @@ -591,7 +593,7 @@ u3_disk_read_meta(u3_book* txt_u, u3_meta* met_u) // fake bit // - u3_book_read_meta(txt_u, &val_u, "fake", _disk_meta_read_cb); + u3_lmdb_read_meta(mdb_u, &val_u, "fake", _disk_meta_read_cb); if ( 0 > val_u.hav_i ) { fprintf(stderr, "disk: read meta: no fake bit\r\n"); @@ -611,7 +613,7 @@ u3_disk_read_meta(u3_book* txt_u, u3_meta* met_u) // life // - u3_book_read_meta(txt_u, &val_u, "life", _disk_meta_read_cb); + u3_lmdb_read_meta(mdb_u, &val_u, "life", _disk_meta_read_cb); if ( 0 > val_u.hav_i ) { fprintf(stderr, "disk: read meta: no lifecycle length\r\n"); @@ -651,7 +653,7 @@ 
u3_disk_read_meta(u3_book* txt_u, u3_meta* met_u) } // NB: we read metadata from LMDB even when met_u is null because sometimes - // because sometimes we call this just to ensure metadata exists + // we call this just to ensure metadata exists if ( met_u ) { met_u->ver_w = ver_w; memcpy(met_u->who_d, who_d, 2 * sizeof(c3_d)); @@ -823,9 +825,19 @@ u3_disk_exit(u3_disk* log_u) return; } - // close database + // close LMDB metadata environment (if still open) // - u3_book_exit(log_u->txt_u); + if ( log_u->mdb_u ) { + u3_lmdb_exit(log_u->mdb_u); + log_u->mdb_u = 0; + } + + // close epoch event log (book) + // + if ( log_u->txt_u ) { + u3_book_exit(log_u->txt_u); + log_u->txt_u = 0; + } // dispose planned writes // @@ -1143,14 +1155,18 @@ _disk_epoc_roll(u3_disk* log_u, c3_d epo_d) } #endif - // get metadata from old log, update version + // get metadata from top-level LMDB, update version u3_meta old_u; - if ( c3y != u3_disk_read_meta(log_u->txt_u, &old_u) ) { + if ( c3y != u3_disk_read_meta(log_u->mdb_u, &old_u) ) { fprintf(stderr, "disk: failed to read metadata\r\n"); goto fail3; } - u3_book_exit(log_u->txt_u); - log_u->txt_u = 0; + + // close old epoch book if still open + if ( log_u->txt_u ) { + u3_book_exit(log_u->txt_u); + log_u->txt_u = 0; + } // initialize db of new epoch if ( 0 == (log_u->txt_u = u3_book_init(epo_c)) ) { @@ -1159,10 +1175,13 @@ _disk_epoc_roll(u3_disk* log_u, c3_d epo_d) goto fail3; } - // write the metadata to the database + // write the metadata to the epoch's book old_u.ver_w = U3D_VERLAT; - if ( c3n == u3_disk_save_meta(log_u->txt_u, &old_u) ) { - fprintf(stderr, "disk: failed to save metadata\r\n"); + if ( c3n == u3_book_save_meta(log_u->txt_u, "version", sizeof(c3_w), (c3_y*)&old_u.ver_w) + || c3n == u3_book_save_meta(log_u->txt_u, "who", sizeof(old_u.who_d), (c3_y*)old_u.who_d) + || c3n == u3_book_save_meta(log_u->txt_u, "fake", sizeof(c3_o), (c3_y*)&old_u.fak_o) + || c3n == u3_book_save_meta(log_u->txt_u, "life", sizeof(c3_w), 
(c3_y*)&old_u.lif_w) ) { + fprintf(stderr, "disk: failed to save metadata to epoch\r\n"); goto fail3; } @@ -1338,16 +1357,16 @@ _disk_migrate_epoc(u3_disk* log_u, c3_d eve_d) // NB: requires that log_u->txt_u is initialized to log/data.mdb // XX: put old log in separate pointer (old_u?)? - // get metadata from old log, update version + // get metadata from top-level LMDB, update version u3_meta olm_u; - if ( c3y != u3_disk_read_meta(log_u->txt_u, &olm_u) ) { + if ( c3y != u3_disk_read_meta(log_u->mdb_u, &olm_u) ) { fprintf(stderr, "disk: failed to read metadata\r\n"); return c3n; } - // finish with old log - u3_book_exit(log_u->txt_u); - log_u->txt_u = 0; + // finish with old log LMDB (will be re-initialized for epoch) + u3_lmdb_exit(log_u->mdb_u); + log_u->mdb_u = 0; // check if lock.mdb is readable in log directory c3_o luk_o = c3n; @@ -1421,8 +1440,11 @@ _disk_migrate_epoc(u3_disk* log_u, c3_d eve_d) } olm_u.ver_w = U3D_VERLAT; - if ( c3n == u3_disk_save_meta(log_u->txt_u, &olm_u) ) { - fprintf(stderr, "disk: failed to save metadata\r\n"); + if ( c3n == u3_book_save_meta(log_u->txt_u, "version", sizeof(c3_w), (c3_y*)&olm_u.ver_w) + || c3n == u3_book_save_meta(log_u->txt_u, "who", sizeof(olm_u.who_d), (c3_y*)olm_u.who_d) + || c3n == u3_book_save_meta(log_u->txt_u, "fake", sizeof(c3_o), (c3_y*)&olm_u.fak_o) + || c3n == u3_book_save_meta(log_u->txt_u, "life", sizeof(c3_w), (c3_y*)&olm_u.lif_w) ) { + fprintf(stderr, "disk: failed to save metadata to book\r\n"); return c3n; } @@ -1686,9 +1708,9 @@ _disk_migrate_old(u3_disk* log_u) case U3D_VER1: { _disk_migrate_loom(log_u->dir_u->pax_c, las_d); - // set version to 2 (migration in progress) + // set version to 2 (migration in progress) in top-level LMDB log_u->ver_w = U3D_VER2; - if ( c3n == _disk_save_meta(log_u->txt_u, "version", sizeof(c3_w), (c3_y*)&log_u->ver_w) ) { + if ( c3n == _disk_save_meta(log_u->mdb_u, "version", sizeof(c3_w), (c3_y*)&log_u->ver_w) ) { fprintf(stderr, "disk: failed to set version to 
2\r\n"); exit(1); } @@ -1784,9 +1806,9 @@ _disk_epoc_load(u3_disk* log_u, c3_d lat_d, u3_disk_load_e lod_e) if ( (u3_dlod_boot != lod_e) && !fir_d - && !las_d - && (c3n == u3_disk_read_meta(log_u->txt_u, 0)) ) + && !las_d ) { + // empty epoch (no events and no metadata) u3_book_exit(log_u->txt_u); log_u->txt_u = 0; return _epoc_void; @@ -2035,12 +2057,12 @@ u3_disk_load(c3_c* pax_c, u3_disk_load_e lod_e) return log_u; } - // read metadata (version) from old log / top-level + // read metadata (version) from top-level LMDB // { u3_meta met_u; - if ( (0 == (log_u->txt_u = u3_book_init(log_c))) - || (c3n == u3_disk_read_meta(log_u->txt_u, &met_u)) ) + if ( (0 == (log_u->mdb_u = u3_lmdb_init(log_c, 1ULL << 30))) + || (c3n == u3_disk_read_meta(log_u->mdb_u, &met_u)) ) { fprintf(stderr, "disk: failed to read metadata\r\n"); c3_free(log_u); // XX leaks dire(s) @@ -2059,14 +2081,19 @@ u3_disk_load(c3_c* pax_c, u3_disk_load_e lod_e) fprintf(stderr, "migration required, replay disallowed\r\n"); exit(1); } + // for old ships, also open the top-level book for event data + if ( 0 == (log_u->txt_u = u3_book_init(log_c)) ) { + fprintf(stderr, "disk: failed to open old book\r\n"); + c3_free(log_u); // XX leaks dire(s) + return 0; + } _disk_migrate_old(log_u); log_u->liv_o = c3y; return log_u; } - // close top-level book - u3_book_exit(log_u->txt_u); - log_u->txt_u = 0; + // keep top-level LMDB metadata environment open for later access + // (txt_u will be initialized for the epoch next) // get latest epoch number c3_d lat_d; diff --git a/pkg/vere/mars.c b/pkg/vere/mars.c index af5b65373a..48745f200c 100644 --- a/pkg/vere/mars.c +++ b/pkg/vere/mars.c @@ -1295,7 +1295,7 @@ u3_mars_play(u3_mars* mar_u, c3_d eve_d, c3_d sap_d) if ( !mar_u->dun_d ) { u3_meta met_u; - if ( c3n == u3_disk_read_meta(log_u->txt_u, &met_u) ) { + if ( c3n == u3_disk_read_meta(log_u->mdb_u, &met_u) ) { fprintf(stderr, "mars: disk read meta fail\r\n"); // XX exit code, cb // @@ -1451,7 +1451,7 @@ 
u3_mars_load(u3_mars* mar_u, u3_disk_load_e lod_e) mar_u->sen_d = mar_u->dun_d = u3A->eve_d; mar_u->mug_l = u3r_mug(u3A->roc); - if ( c3n == u3_disk_read_meta(mar_u->log_u->txt_u, &(mar_u->met_u)) ) { + if ( c3n == u3_disk_read_meta(mar_u->log_u->mdb_u, &(mar_u->met_u)) ) { fprintf(stderr, "mars: disk meta fail\r\n"); u3_disk_exit(mar_u->log_u); exit(1); // XX @@ -1949,7 +1949,10 @@ u3_mars_boot(u3_mars* mar_u, c3_d len_d, c3_y* hun_y) exit(1); // XX cleanup } - if ( c3n == u3_disk_save_meta(log_u->txt_u, &met_u) ) { + if ( c3n == u3_book_save_meta(log_u->txt_u, "version", sizeof(c3_w), (c3_y*)&met_u.ver_w) + || c3n == u3_book_save_meta(log_u->txt_u, "who", sizeof(met_u.who_d), (c3_y*)met_u.who_d) + || c3n == u3_book_save_meta(log_u->txt_u, "fake", sizeof(c3_o), (c3_y*)&met_u.fak_o) + || c3n == u3_book_save_meta(log_u->txt_u, "life", sizeof(c3_w), (c3_y*)&met_u.lif_w) ) { exit(1); // XX cleanup } diff --git a/pkg/vere/vere.h b/pkg/vere/vere.h index cd4460aef5..32e3b4bc84 100644 --- a/pkg/vere/vere.h +++ b/pkg/vere/vere.h @@ -533,6 +533,7 @@ c3_i lok_i; // lockfile c3_o liv_o; // live c3_w ver_w; // version (see version.h) + MDB_env* mdb_u; // lmdb env for top-level metadata u3_book* txt_u; // book env of current epoch c3_d sen_d; // commit requested c3_d dun_d; // committed @@ -875,15 +876,15 @@ void u3_disk_exit(u3_disk* log_u); - /* u3_disk_read_meta(): read metadata. + /* u3_disk_read_meta(): read metadata from LMDB. */ c3_o - u3_disk_read_meta(u3_book* txt_u, u3_meta* met_u); + u3_disk_read_meta(MDB_env* mdb_u, u3_meta* met_u); - /* u3_disk_save_meta(): save metadata. + /* u3_disk_save_meta(): save metadata to LMDB. */ c3_o - u3_disk_save_meta(u3_book* txt_u, const u3_meta* met_u); + u3_disk_save_meta(MDB_env* mdb_u, const u3_meta* met_u); /* u3_disk_save_meta_meta(): save meta metadata. 
*/ From 2d62e24ff54addd98277a27527c3e5e40881dbb1 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Wed, 21 Jan 2026 14:16:13 -0500 Subject: [PATCH 17/38] disk: remove unused function and use lmdb for top-level metadata --- pkg/vere/db/book.c | 20 +++++--------------- pkg/vere/disk.c | 25 +++++++++---------------- 2 files changed, 14 insertions(+), 31 deletions(-) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index a2d60c82a8..fab8d3ede6 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -39,32 +39,22 @@ _book_crc32_two(c3_y* one_y, c3_w one_w, c3_y* two_y, c3_w two_w) return (c3_w)crc32(crc_w, two_y, two_w); } -/* _book_meta_path(): construct path to meta.bin from book.log path. +/* _book_meta_path(): construct path to meta.bin from book directory path. ** +** pax_c should be a directory path (the one passed to u3_book_init) ** caller must free result with c3_free() */ static c3_c* _book_meta_path(const c3_c* pax_c) { c3_c* met_c = c3_malloc(strlen(pax_c) + 16); - c3_c* dir_c = c3_malloc(strlen(pax_c) + 1); - if ( !met_c || !dir_c ) { - c3_free(met_c); - c3_free(dir_c); + if ( !met_c ) { return 0; } - strcpy(dir_c, pax_c); - - // find last '/' to get directory - c3_c* sla_c = strrchr(dir_c, '/'); - if ( sla_c ) { - *sla_c = '\0'; - } - - snprintf(met_c, strlen(pax_c) + 16, "%s/meta.bin", dir_c); - c3_free(dir_c); + // pax_c is already the directory, just append /meta.bin + snprintf(met_c, strlen(pax_c) + 16, "%s/meta.bin", pax_c); return met_c; } diff --git a/pkg/vere/disk.c b/pkg/vere/disk.c index 98c2f8d919..0ba591bc3f 100644 --- a/pkg/vere/disk.c +++ b/pkg/vere/disk.c @@ -462,14 +462,6 @@ u3_disk_walk_done(u3_disk_walk* wok_u) c3_free(wok_u); } -/* _disk_save_meta(): save metadata field to LMDB. -*/ -static c3_o -_disk_save_meta(MDB_env* mdb_u, const c3_c* key_c, c3_w len_w, c3_y* byt_y) -{ - return u3_lmdb_save_meta(mdb_u, key_c, len_w, byt_y); -} - /* u3_disk_save_meta(): save metadata to LMDB. 
*/ c3_o @@ -479,10 +471,10 @@ u3_disk_save_meta(MDB_env* mdb_u, const u3_meta* met_u) u3_noun who = u3i_chubs(2, met_u->who_d); - if ( (c3n == _disk_save_meta(mdb_u, "version", sizeof(c3_w), (c3_y*)&met_u->ver_w)) - || (c3n == _disk_save_meta(mdb_u, "who", sizeof(met_u->who_d), (c3_y*)met_u->who_d)) - || (c3n == _disk_save_meta(mdb_u, "fake", sizeof(c3_o), (c3_y*)&met_u->fak_o)) - || (c3n == _disk_save_meta(mdb_u, "life", sizeof(c3_w), (c3_y*)&met_u->lif_w)) ) + if ( (c3n == u3_lmdb_save_meta(mdb_u, "version", sizeof(c3_w), (c3_y*)&met_u->ver_w)) + || (c3n == u3_lmdb_save_meta(mdb_u, "who", sizeof(met_u->who_d), (c3_y*)met_u->who_d)) + || (c3n == u3_lmdb_save_meta(mdb_u, "fake", sizeof(c3_o), (c3_y*)&met_u->fak_o)) + || (c3n == u3_lmdb_save_meta(mdb_u, "life", sizeof(c3_w), (c3_y*)&met_u->lif_w)) ) { u3z(who); return c3n; @@ -1710,7 +1702,7 @@ _disk_migrate_old(u3_disk* log_u) // set version to 2 (migration in progress) in top-level LMDB log_u->ver_w = U3D_VER2; - if ( c3n == _disk_save_meta(log_u->mdb_u, "version", sizeof(c3_w), (c3_y*)&log_u->ver_w) ) { + if ( c3n == u3_lmdb_save_meta(log_u->mdb_u, "version", sizeof(c3_w), (c3_y*)&log_u->ver_w) ) { fprintf(stderr, "disk: failed to set version to 2\r\n"); exit(1); } @@ -2081,13 +2073,12 @@ u3_disk_load(c3_c* pax_c, u3_disk_load_e lod_e) fprintf(stderr, "migration required, replay disallowed\r\n"); exit(1); } - // for old ships, also open the top-level book for event data - if ( 0 == (log_u->txt_u = u3_book_init(log_c)) ) { + // for old ships, also open the top-level lmdb file for metadata + if ( 0 == (log_u->mdb_u = u3_lmdb_init(log_c, 1ULL << 30)) ) { fprintf(stderr, "disk: failed to open old book\r\n"); c3_free(log_u); // XX leaks dire(s) return 0; } - _disk_migrate_old(log_u); log_u->liv_o = c3y; return log_u; } @@ -2167,4 +2158,6 @@ u3_disk_load(c3_c* pax_c, u3_disk_load_e lod_e) } } } + + return log_u; } From 3d994bd7a400356e238033dc7a186e9f4747bbad Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Thu, 22 
Jan 2026 20:21:25 -0500 Subject: [PATCH 18/38] disk: implements book migration --- pkg/noun/version.h | 3 +- pkg/vere/disk.c | 128 +++++++++++++++++++++++++++++++++++---------- pkg/vere/vere.h | 4 +- 3 files changed, 105 insertions(+), 30 deletions(-) diff --git a/pkg/noun/version.h b/pkg/noun/version.h index 88c6ac400a..01e44e79d7 100644 --- a/pkg/noun/version.h +++ b/pkg/noun/version.h @@ -38,6 +38,7 @@ typedef c3_w u3e_version; */ #define U3E_VER1 1 // north+south.bin #define U3E_VER2 2 // image.bin -#define U3E_VERLAT U3E_VER2 +#define U3E_VER3 3 // book.log (append-only event log) +#define U3E_VERLAT U3E_VER3 #endif /* ifndef U3_VERSION_H */ diff --git a/pkg/vere/disk.c b/pkg/vere/disk.c index 0ba591bc3f..0ac54dc365 100644 --- a/pkg/vere/disk.c +++ b/pkg/vere/disk.c @@ -462,7 +462,7 @@ u3_disk_walk_done(u3_disk_walk* wok_u) c3_free(wok_u); } -/* u3_disk_save_meta(): save metadata to LMDB. +/* u3_disk_save_meta(): save metadata to lmdb. */ c3_o u3_disk_save_meta(MDB_env* mdb_u, const u3_meta* met_u) @@ -485,7 +485,7 @@ u3_disk_save_meta(MDB_env* mdb_u, const u3_meta* met_u) } -/* u3_disk_save_meta_meta(): save meta metadata using LMDB. +/* u3_disk_save_meta_meta(): save meta metadata using lmdb. */ c3_o u3_disk_save_meta_meta(c3_c* log_c, const u3_meta* met_u) @@ -531,7 +531,7 @@ _disk_meta_read_cb(void* ptr_v, ssize_t val_i, void* val_v) } } -/* u3_disk_read_meta(): read metadata from LMDB. +/* u3_disk_read_meta(): read metadata from lmdb. 
*/ c3_o u3_disk_read_meta(MDB_env* mdb_u, u3_meta* met_u) @@ -644,7 +644,7 @@ u3_disk_read_meta(MDB_env* mdb_u, u3_meta* met_u) } } - // NB: we read metadata from LMDB even when met_u is null because sometimes + // NB: we read metadata from lmdb even when met_u is null because sometimes // we call this just to ensure metadata exists if ( met_u ) { met_u->ver_w = ver_w; @@ -817,7 +817,7 @@ u3_disk_exit(u3_disk* log_u) return; } - // close LMDB metadata environment (if still open) + // close lmdb metadata environment (if still open) // if ( log_u->mdb_u ) { u3_lmdb_exit(log_u->mdb_u); @@ -1147,7 +1147,7 @@ _disk_epoc_roll(u3_disk* log_u, c3_d epo_d) } #endif - // get metadata from top-level LMDB, update version + // get metadata from top-level lmdb, update version u3_meta old_u; if ( c3y != u3_disk_read_meta(log_u->mdb_u, &old_u) ) { fprintf(stderr, "disk: failed to read metadata\r\n"); @@ -1349,14 +1349,14 @@ _disk_migrate_epoc(u3_disk* log_u, c3_d eve_d) // NB: requires that log_u->txt_u is initialized to log/data.mdb // XX: put old log in separate pointer (old_u?)? 
- // get metadata from top-level LMDB, update version + // get metadata from top-level lmdb, update version u3_meta olm_u; if ( c3y != u3_disk_read_meta(log_u->mdb_u, &olm_u) ) { fprintf(stderr, "disk: failed to read metadata\r\n"); return c3n; } - // finish with old log LMDB (will be re-initialized for epoch) + // finish with old log lmdb (will be re-initialized for epoch) u3_lmdb_exit(log_u->mdb_u); log_u->mdb_u = 0; @@ -1700,7 +1700,7 @@ _disk_migrate_old(u3_disk* log_u) case U3D_VER1: { _disk_migrate_loom(log_u->dir_u->pax_c, las_d); - // set version to 2 (migration in progress) in top-level LMDB + // set version to 2 (migration in progress) in top-level lmdb log_u->ver_w = U3D_VER2; if ( c3n == u3_lmdb_save_meta(log_u->mdb_u, "version", sizeof(c3_w), (c3_y*)&log_u->ver_w) ) { fprintf(stderr, "disk: failed to set version to 2\r\n"); @@ -1778,22 +1778,48 @@ _disk_epoc_load(u3_disk* log_u, c3_d lat_d, u3_disk_load_e lod_e) c3_c epo_c[8193]; snprintf(epo_c, 8192, "%s/0i%" PRIc3_d, log_u->com_u->pax_c, lat_d); - // initialize latest epoch's db - if ( 0 == (log_u->txt_u = u3_book_init(epo_c)) ) { - fprintf(stderr, "disk: failed to initialize database at %s\r\n", - epo_c); - return _epoc_fail; - } + // for U3E_VER1 and U3E_VER2 epochs, we need special handling + // both use lmdb format, but the new system uses book.log + // we read metadata from the old lmdb then trigger migration via rollover + c3_d fir_d, las_d; + + if ( U3E_VER2 >= ver_w ) { + // open with lmdb temporarily to get first and last events + MDB_env* mdb_u = u3_lmdb_init(epo_c, 1ULL << 30); + if ( 0 == mdb_u ) { + fprintf(stderr, "disk: failed to initialize lmdb at %s\r\n", epo_c); + return _epoc_fail; + } - fprintf(stderr, "disk: loaded epoch 0i%" PRIc3_d "\r\n", lat_d); + // get first/last event numbers from lmdb + if ( c3n == u3_lmdb_gulf(mdb_u, &fir_d, &las_d) ) { + fprintf(stderr, "disk: failed to get first/last event numbers\r\n"); + u3_lmdb_exit(mdb_u); + return _epoc_fail; + } - // get 
first/last event numbers from book - c3_d fir_d, las_d; - if ( c3n == u3_book_gulf(log_u->txt_u, &fir_d, &las_d) ) { - fprintf(stderr, "disk: failed to get first/last event numbers\r\n"); - u3_book_exit(log_u->txt_u); + u3_lmdb_exit(mdb_u); + + // store null for txt_u to indicate lmdb-format epoch (will need migration) log_u->txt_u = 0; - return _epoc_fail; + } + else { + // initialize latest epoch's db for U3E_VER3+ (book format) + if ( 0 == (log_u->txt_u = u3_book_init(epo_c)) ) { + fprintf(stderr, "disk: failed to initialize database at %s\r\n", + epo_c); + return _epoc_fail; + } + + fprintf(stderr, "disk: loaded epoch 0i%" PRIc3_d "\r\n", lat_d); + + // get first/last event numbers from book + if ( c3n == u3_book_gulf(log_u->txt_u, &fir_d, &las_d) ) { + fprintf(stderr, "disk: failed to get first/last event numbers\r\n"); + u3_book_exit(log_u->txt_u); + log_u->txt_u = 0; + return _epoc_fail; + } } if ( (u3_dlod_boot != lod_e) @@ -1819,6 +1845,12 @@ _disk_epoc_load(u3_disk* log_u, c3_d lat_d, u3_disk_load_e lod_e) // switch ( ver_w ) { case U3E_VER1: { + // migration from U3E_VER1 (lmdb with loom files) to U3E_VER3 (book.log) + // txt_u is null for U3E_VER1 since we can't keep lmdb epoch open + // we must perform loom migration and then rollover to new format epoch + // + fprintf(stderr, "disk: epoch v1 detected, migrating to v3...\r\n"); + if ( u3_dlod_epoc == lod_e ) { fprintf(stderr, "migration required, replay disallowed\r\n"); exit(1); @@ -1834,10 +1866,18 @@ _disk_epoc_load(u3_disk* log_u, c3_d lat_d, u3_disk_load_e lod_e) } _disk_unlink_stale_loom(log_u->dir_u->pax_c); + fprintf(stderr, "disk: epoch v3 migration done\r\n"); + return _epoc_good; } break; case U3E_VER2: { + // migration from U3E_VER2 (data.mdb) to U3E_VER3 (book.log) + // txt_u is null for U3E_VER2 since we can't keep lmdb epoch open + // we must trigger an immediate rollover to create the new format epoch + // + fprintf(stderr, "disk: epoch v2 detected, migrating to v3...\r\n"); + if ( 
u3_dlod_epoc == lod_e ) { c3_c chk_c[8193]; snprintf(chk_c, 8193, "%s/.urb/chk", log_u->dir_u->pax_c); @@ -1876,10 +1916,22 @@ _disk_epoc_load(u3_disk* log_u, c3_d lat_d, u3_disk_load_e lod_e) exit(1); } - if ( (u3C.wag_w & u3o_yolo) // XX better argument to disable autoroll - || (!log_u->epo_d && log_u->dun_d && !u3A->eve_d) - || (c3n == _disk_vere_diff(log_u)) ) + // for U3E_VER2, we always need to perform rollover + // this creates a new epoch in U3E_VER3 format while keeping the old one + if ( log_u->dun_d == u3A->eve_d ) { + fprintf(stderr, "disk: rolling over to new U3E_VER3 epoch\r\n"); + if ( c3n == _disk_epoc_roll(log_u, log_u->dun_d) ) { + fprintf(stderr, "disk: failed to roll over epoch\r\n"); + exit(1); + } + fprintf(stderr, "disk: epoch v3 migration done\r\n"); + return _epoc_good; + } + + if ( (u3C.wag_w & u3o_yolo) + || (!log_u->epo_d && log_u->dun_d && !u3A->eve_d) ) { + // ok to proceed without rollover in special cases return _epoc_good; } else if ( log_u->dun_d != u3A->eve_d ) { @@ -1892,6 +1944,28 @@ _disk_epoc_load(u3_disk* log_u, c3_d lat_d, u3_disk_load_e lod_e) fprintf(stderr, "disk: failed to initialize epoch\r\n"); exit(1); } + + fprintf(stderr, "disk: epoch v3 migration done\r\n"); + return _epoc_good; + } break; + + case U3E_VER3: { + u3m_boot(log_u->dir_u->pax_c, (size_t)1 << u3_Host.ops_u.lom_y); // XX confirm + + if ( log_u->dun_d < u3A->eve_d ) { + // XX bad, add to enum + fprintf(stderr, "mars: corrupt pier, snapshot (%" PRIu64 + ") from future (log=%" PRIu64 ")\r\n", + u3A->eve_d, log_u->dun_d); + exit(1); + } + else if ( u3A->eve_d < log_u->epo_d ) { + // XX goto full replay + fprintf(stderr, "mars: corrupt pier, snapshot (%" PRIu64 + ") out of epoch (%" PRIu64 ")\r\n", + u3A->eve_d, log_u->epo_d); + exit(1); + } return _epoc_good; } break; @@ -2049,7 +2123,7 @@ u3_disk_load(c3_c* pax_c, u3_disk_load_e lod_e) return log_u; } - // read metadata (version) from top-level LMDB + // read metadata (version) from top-level lmdb // { 
u3_meta met_u; @@ -2083,7 +2157,7 @@ u3_disk_load(c3_c* pax_c, u3_disk_load_e lod_e) return log_u; } - // keep top-level LMDB metadata environment open for later access + // keep top-level lmdb metadata environment open for later access // (txt_u will be initialized for the epoch next) // get latest epoch number diff --git a/pkg/vere/vere.h b/pkg/vere/vere.h index 32e3b4bc84..6c6ce1db44 100644 --- a/pkg/vere/vere.h +++ b/pkg/vere/vere.h @@ -876,12 +876,12 @@ void u3_disk_exit(u3_disk* log_u); - /* u3_disk_read_meta(): read metadata from LMDB. + /* u3_disk_read_meta(): read metadata from lmdb. */ c3_o u3_disk_read_meta(MDB_env* mdb_u, u3_meta* met_u); - /* u3_disk_save_meta(): save metadata to LMDB. + /* u3_disk_save_meta(): save metadata to lmdb. */ c3_o u3_disk_save_meta(MDB_env* mdb_u, const u3_meta* met_u); From 4950a2ba5ddf36c5aa73d71b7a8881d4a157c252 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Thu, 22 Jan 2026 21:32:02 -0500 Subject: [PATCH 19/38] book: ensures consistency of scans --- pkg/noun/version.h | 2 +- pkg/vere/db/book.c | 35 +++++++++++++++++++---------------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/pkg/noun/version.h b/pkg/noun/version.h index 01e44e79d7..b19f7289a7 100644 --- a/pkg/noun/version.h +++ b/pkg/noun/version.h @@ -38,7 +38,7 @@ typedef c3_w u3e_version; */ #define U3E_VER1 1 // north+south.bin #define U3E_VER2 2 // image.bin -#define U3E_VER3 3 // book.log (append-only event log) +#define U3E_VER3 3 // book.log #define U3E_VERLAT U3E_VER3 #endif /* ifndef U3_VERSION_H */ diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index fab8d3ede6..9309863d9d 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -491,16 +491,19 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) return c3y; } -/* _book_scan_end(): scan to find actual end of valid events. +/* _book_scan_fore(): scan to find last valid deed. ** ** validates each record's CRC and len_d == let_d. -** caches las_d and off_d in txt_u. 
-** returns offset to append next event. +** on success, sets *off_d to append offset and updates txt_u->las_d. +** +** returns: +** c3y: success +** c3n: failure (empty file or no valid deeds) */ -static c3_d -_book_scan_end(u3_book* txt_u) +static c3_o +_book_scan_fore(u3_book* txt_u, c3_d* off_d) { - c3_d off_d = sizeof(u3_book_head); // start of events + c3_d cur_d = sizeof(u3_book_head); // start of events c3_d cot_d = 0; // count c3_d las_d = 0; // last valid event found c3_d exp_d; // expected event number @@ -508,25 +511,25 @@ _book_scan_end(u3_book* txt_u) if ( 0 == txt_u->hed_u.fir_d && 0 == txt_u->las_d ) { // empty log txt_u->las_d = 0; - txt_u->off_d = off_d; - return off_d; + *off_d = cur_d; + return c3n; } exp_d = txt_u->las_d - txt_u->hed_u.fir_d + 1; while ( 1 ) { u3_book_reed red_u; - c3_d off_start = off_d; + c3_d beg_d = cur_d; // read deed into reed - if ( c3n == _book_read_deed(txt_u->fid_i, &off_d, &red_u) ) { + if ( c3n == _book_read_deed(txt_u->fid_i, &cur_d, &red_u) ) { // EOF or read error break; } // validate reed (CRC and length checks) if ( c3n == _book_okay_reed(&red_u) ) { - u3l_log("book: validation failed at offset %" PRIu64 "\r\n", off_start); + u3l_log("book: validation failed at offset %" PRIu64 "\r\n", beg_d); c3_free(red_u.jam_y); break; } @@ -544,13 +547,13 @@ _book_scan_end(u3_book* txt_u) // update las_d based on what we found if ( 0 == cot_d ) { txt_u->las_d = 0; - off_d = sizeof(u3_book_head); + cur_d = sizeof(u3_book_head); } else { txt_u->las_d = las_d; } // truncate file - if ( -1 == ftruncate(txt_u->fid_i, off_d) ) { + if ( -1 == ftruncate(txt_u->fid_i, cur_d) ) { u3l_log("book: failed to truncate: %s\r\n", strerror(errno)); } else { @@ -563,8 +566,8 @@ _book_scan_end(u3_book* txt_u) txt_u->las_d = las_d; } - txt_u->off_d = off_d; - return off_d; + *off_d = cur_d; + return c3y; } /* u3_book_init(): open/create event log. 
@@ -647,7 +650,7 @@ u3_book_init(const c3_c* pax_c) // try fast reverse scan first, fall back to forward scan if needed if ( c3n == _book_scan_back(txt_u, &txt_u->off_d) ) { // reverse scan failed, use forward scan for recovery - _book_scan_end(txt_u); + _book_scan_fore(txt_u, &txt_u->off_d); } } From 720a29624708c9146acc07afc123a9ebbc7791d2 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Thu, 22 Jan 2026 21:59:04 -0500 Subject: [PATCH 20/38] book: cleans, clarifies, and adds metadata to stat --- pkg/vere/db/book.c | 48 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index 9309863d9d..7a3b1d27af 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -12,6 +12,7 @@ #include "c3/c3.h" #include "noun.h" +#include "ship.h" // book: append-only event log // @@ -575,19 +576,19 @@ _book_scan_fore(u3_book* txt_u, c3_d* off_d) u3_book* u3_book_init(const c3_c* pax_c) { - c3_c path_c[8193]; + c3_c log_c[8193]; c3_i fid_i, met_i; struct stat buf_u; u3_book* txt_u; // construct path to book.log - snprintf(path_c, sizeof(path_c), "%s/book.log", pax_c); + snprintf(log_c, sizeof(log_c), "%s/book.log", pax_c); // open or create file - fid_i = c3_open(path_c, O_RDWR | O_CREAT, 0644); + fid_i = c3_open(log_c, O_RDWR | O_CREAT, 0644); if ( 0 > fid_i ) { u3l_log("book: failed to open %s: %s\r\n", - path_c, strerror(errno)); + log_c, strerror(errno)); return 0; } @@ -611,14 +612,14 @@ u3_book_init(const c3_c* pax_c) txt_u = c3_calloc(sizeof(u3_book)); txt_u->fid_i = fid_i; txt_u->met_i = met_i; - txt_u->pax_c = c3_malloc(strlen(path_c) + 1); + txt_u->pax_c = c3_malloc(strlen(log_c) + 1); if ( !txt_u->pax_c ) { close(fid_i); close(met_i); c3_free(txt_u); return 0; } - strcpy(txt_u->pax_c, path_c); + strcpy(txt_u->pax_c, log_c); if ( buf_u.st_size == 0 ) { // new file: initialize and write header @@ -694,19 +695,19 @@ u3_book_gulf(u3_book* txt_u, c3_d* low_d, c3_d* hig_d) return 
c3y; } -/* u3_book_stat(): print book statistics. +/* u3_book_stat(): print book statistics. expects path to book.log. */ void -u3_book_stat(const c3_c* pax_c) +u3_book_stat(const c3_c* log_c) { c3_i fid_i; u3_book_head hed_u; struct stat buf_u; // open the file directly - fid_i = c3_open(pax_c, O_RDONLY, 0); + fid_i = c3_open(log_c, O_RDONLY, 0); if ( fid_i < 0 ) { - fprintf(stderr, "book: failed to open %s: %s\r\n", pax_c, strerror(errno)); + fprintf(stderr, "book: failed to open %s: %s\r\n", log_c, strerror(errno)); return; } @@ -736,11 +737,34 @@ u3_book_stat(const c3_c* pax_c) } fprintf(stderr, "book info:\r\n"); - fprintf(stderr, " file: %s\r\n", pax_c); - fprintf(stderr, " version: %u\r\n", hed_u.ver_w); + fprintf(stderr, " file: %s\r\n", log_c); + fprintf(stderr, " format: %u\r\n", hed_u.ver_w); fprintf(stderr, " first event: %" PRIu64 "\r\n", hed_u.fir_d); fprintf(stderr, " file size: %lld bytes\r\n", (long long)buf_u.st_size); + // read metadata from meta.bin + u3_book_meta met_u; + c3_c* epo_c = c3_malloc(strlen(log_c) - 8); + if ( epo_c ) { + strncpy(epo_c, log_c, strlen(log_c) - 9); + epo_c[strlen(log_c) - 9] = '\0'; // lops "/book.log" + } + c3_c* met_c = _book_meta_path(epo_c); + c3_i met_i = c3_open(met_c, O_RDONLY, 0); + + if ( met_i >= 0 ) { + c3_zs ret_zs = pread(met_i, &met_u, sizeof(u3_book_meta), 0); + if ( ret_zs == sizeof(u3_book_meta) ) { + fprintf(stderr, "\r\ndisk metadata:\r\n"); + fprintf(stderr, " who: %s\r\n", u3_ship_to_string(met_u.who_d)); + fprintf(stderr, " version: %u\r\n", met_u.ver_w); + fprintf(stderr, " fake: %s\r\n", _(met_u.fak_o) ? 
"yes" : "no"); + fprintf(stderr, " life: %u\r\n", met_u.lif_w); + } + close(met_i); + } + c3_free(met_c); + close(fid_i); } From ef589dab9b1be76edb8b482bd4d4a417c0d1faee Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Thu, 22 Jan 2026 22:19:52 -0500 Subject: [PATCH 21/38] book: fixes leak, improves test safety, removes arbitrary event size limit --- pkg/vere/book_tests.c | 4 ++++ pkg/vere/db/book.c | 8 ++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/vere/book_tests.c b/pkg/vere/book_tests.c index 75271a4d56..b111c626c5 100644 --- a/pkg/vere/book_tests.c +++ b/pkg/vere/book_tests.c @@ -36,6 +36,10 @@ _test_make_tmpdir(void) static void _test_rm_rf(const c3_c* pax_c) { + if ( !pax_c || strncmp(pax_c, "/tmp", 4) != 0 ) { + fprintf(stderr, "book_test: refusing to remove non-/tmp path: %s\r\n", pax_c); + exit(1); + } c3_c cmd_c[8192]; snprintf(cmd_c, sizeof(cmd_c), "rm -rf %s", pax_c); system(cmd_c); diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index 7a3b1d27af..31ab9aa402 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -442,11 +442,6 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) break; } - // validate let_d is reasonable - if ( 0 == let_d || (1ULL << 32) < let_d ) { - break; - } - // calculate deed size and start position siz_d = _book_deed_size(let_d); if ( siz_d > pos_d - sizeof(u3_book_head) ) { @@ -746,10 +741,11 @@ u3_book_stat(const c3_c* log_c) u3_book_meta met_u; c3_c* epo_c = c3_malloc(strlen(log_c) - 8); if ( epo_c ) { - strncpy(epo_c, log_c, strlen(log_c) - 9); + strncpy(epo_c, log_c, strlen(log_c) - 9); // XX brittle epo_c[strlen(log_c) - 9] = '\0'; // lops "/book.log" } c3_c* met_c = _book_meta_path(epo_c); + c3_free(epo_c); c3_i met_i = c3_open(met_c, O_RDONLY, 0); if ( met_i >= 0 ) { From 67d137874b9697f56fa6a32bb1c3fdc7ef763a49 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Fri, 23 Jan 2026 12:07:40 -0500 Subject: [PATCH 22/38] book: adds `eve_d` to `u3_book_deed_head` and improves 
`_book_scan_back` performance --- pkg/vere/db/book.c | 36 +++++++++++++++--------------------- pkg/vere/db/book.h | 2 ++ pkg/vere/disk.c | 2 +- 3 files changed, 18 insertions(+), 22 deletions(-) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index 31ab9aa402..396373ce71 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -219,7 +219,7 @@ static inline c3_w _book_deed_size(c3_d len_d) { return sizeof(u3_book_deed_head) + (len_d - 4) + sizeof(u3_book_deed_tail); - // = 12 + (len_d - 4) + 12 = len_d + 20 + // = 20 + (len_d - 4) + 12 = len_d + 28 } /* _book_calc_crc(): compute CRC32 for reed. @@ -227,11 +227,12 @@ _book_deed_size(c3_d len_d) static c3_w _book_calc_crc(const u3_book_reed* red_u) { - c3_y buf_y[12]; // 8 bytes len_d + 4 bytes mug + c3_y buf_y[20]; // 8 bytes len_d + 8 bytes eve_d + 4 bytes mug memcpy(buf_y, &red_u->len_d, 8); - memcpy(buf_y + 8, &red_u->mug_l, 4); + memcpy(buf_y + 8, &red_u->eve_d, 8); + memcpy(buf_y + 16, &red_u->mug_l, 4); - return _book_crc32_two(buf_y, 12, red_u->jam_y, red_u->len_d - 4); + return _book_crc32_two(buf_y, 20, red_u->jam_y, red_u->len_d - 4); } /* _book_okay_reed(): validate reed integrity. 
@@ -278,6 +279,7 @@ _book_read_deed(c3_i fid_i, c3_d* off_d, u3_book_reed* red_u) // populate reed from head red_u->len_d = hed_u.len_d; + red_u->eve_d = hed_u.eve_d; red_u->mug_l = hed_u.mug_l; // read jam data (len_d - mug bytes) @@ -334,6 +336,7 @@ _book_save_deed(c3_i fid_i, c3_d* off_d, const u3_book_reed* red_u) // write deed_head u3_book_deed_head hed_u; hed_u.len_d = red_u->len_d; + hed_u.eve_d = red_u->eve_d; hed_u.mug_l = red_u->mug_l; ret_zs = pwrite(fid_i, &hed_u, sizeof(u3_book_deed_head), now_d); @@ -405,7 +408,6 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) struct stat buf_u; c3_d end_d; c3_d pos_d; - c3_d cot_d = 0; // count of valid deeds found // get file size if ( -1 == fstat(txt_u->fid_i, &buf_u) ) { @@ -465,26 +467,17 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) break; } + // deed is valid — use eve_d directly c3_free(red_u.jam_y); + *off_d = pos_d; + txt_u->las_d = red_u.eve_d; + return c3y; } - - // deed is valid, record position and continue backwards - cot_d++; - pos_d = ded_d; - } - - // check if we found any valid deeds - if ( 0 == cot_d ) { - *off_d = sizeof(u3_book_head); - return c3n; } - // success: compute last event number - // cot_d deeds found, first event is fir_d - *off_d = end_d; - txt_u->las_d = txt_u->hed_u.fir_d + cot_d - 1; - - return c3y; + // no valid deeds found + *off_d = sizeof(u3_book_head); + return c3n; } /* _book_scan_fore(): scan to find last valid deed. 
@@ -836,6 +829,7 @@ u3_book_save(u3_book* txt_u, memcpy(&red_u.mug_l, buf_y, 4); red_u.jam_y = buf_y + 4; red_u.len_d = siz_d; // total payload: mug + jam + red_u.eve_d = eve_d + i_w; red_u.crc_w = _book_calc_crc(&red_u); // save deed to file diff --git a/pkg/vere/db/book.h b/pkg/vere/db/book.h index fe257d1367..443ec81ef1 100644 --- a/pkg/vere/db/book.h +++ b/pkg/vere/db/book.h @@ -59,6 +59,7 @@ */ typedef struct _u3_book_deed_head { c3_d len_d; // payload size (mug + jam) + c3_d eve_d; // event number c3_l mug_l; // mug/hash } u3_book_deed_head; @@ -84,6 +85,7 @@ */ typedef struct _u3_book_reed { c3_d len_d; // total payload size + c3_d eve_d; // event number c3_l mug_l; // mug/hash c3_y* jam_y; // jam data (caller owns, len = len_d - 4) c3_w crc_w; // CRC32 checksum diff --git a/pkg/vere/disk.c b/pkg/vere/disk.c index 0ac54dc365..7263f746d3 100644 --- a/pkg/vere/disk.c +++ b/pkg/vere/disk.c @@ -2206,7 +2206,7 @@ u3_disk_load(c3_c* pax_c, u3_disk_load_e lod_e) return 0; } - fprintf(stderr, "disk: latest epoch is 0i%" PRIc3_d " is bogus; " + fprintf(stderr, "disk: latest epoch 0i%" PRIc3_d " is bogus; " "falling back to previous at 0i%" PRIc3_d "\r\n", lat_d, sot_d[1]); From b7a696484ad0c2c1878d92ac79d101220b879655 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Mon, 26 Jan 2026 10:56:14 -0500 Subject: [PATCH 23/38] book: ensure `las_d` is initialized correctly in new, non-zero epochs --- pkg/vere/db/book.c | 49 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index 396373ce71..f8367751f7 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -504,7 +504,9 @@ _book_scan_fore(u3_book* txt_u, c3_d* off_d) return c3n; } - exp_d = txt_u->las_d - txt_u->hed_u.fir_d + 1; + exp_d = ( txt_u->las_d >= txt_u->hed_u.fir_d ) + ? 
txt_u->las_d - txt_u->hed_u.fir_d + 1 + : 0; while ( 1 ) { u3_book_reed red_u; @@ -559,7 +561,7 @@ _book_scan_fore(u3_book* txt_u, c3_d* off_d) return c3y; } -/* u3_book_init(): open/create event log. +/* u3_book_init(): open/create event log in epoch directory. */ u3_book* u3_book_init(const c3_c* pax_c) @@ -612,8 +614,42 @@ u3_book_init(const c3_c* pax_c) if ( buf_u.st_size == 0 ) { // new file: initialize and write header _book_make_head(txt_u); - // initialize cache: empty log - txt_u->las_d = 0; + + // extract epoch number from path (last component matching 0iN) + const c3_c* las_c = strrchr(pax_c, '/'); + las_c = las_c ? las_c + 1 : pax_c; + + c3_d epo_d = 0; + if ( 0 == strncmp(las_c, "0i", 2) && las_c[2] ) { + epo_d = strtoull(las_c + 2, NULL, 10); + } + + if ( epo_d ) { + txt_u->hed_u.fir_d = epo_d; + + // persist fir_d (no need if epo_d is 0) + if ( sizeof(c3_d) != pwrite(fid_i, &txt_u->hed_u.fir_d, + sizeof(c3_d), offsetof(u3_book_head, fir_d)) ) + { + u3l_log("book: failed to write fir_d: %s\r\n", strerror(errno)); + close(fid_i); + close(met_i); + c3_free(txt_u->pax_c); + c3_free(txt_u); + return 0; + } + + if ( -1 == c3_sync(fid_i) ) { + u3l_log("book: failed to sync fir_d: %s\r\n", strerror(errno)); + close(fid_i); + close(met_i); + c3_free(txt_u->pax_c); + c3_free(txt_u); + return 0; + } + } + + txt_u->las_d = epo_d; txt_u->off_d = sizeof(u3_book_head); } else if ( buf_u.st_size < (off_t)sizeof(u3_book_head) ) { @@ -641,6 +677,11 @@ u3_book_init(const c3_c* pax_c) // reverse scan failed, use forward scan for recovery _book_scan_fore(txt_u, &txt_u->off_d); } + + // fir_d pre-initialized but no events found: set las_d to match + if ( txt_u->hed_u.fir_d && !txt_u->las_d ) { + txt_u->las_d = txt_u->hed_u.fir_d; + } } return txt_u; From 7c54c40e5fbb04345f252905529a5b4b232219c0 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Mon, 26 Jan 2026 14:13:45 -0500 Subject: [PATCH 24/38] book: cleans up and enforces `u3_book_init` is passed an epoch directory 
--- pkg/vere/db/book.c | 132 ++++++++++++++++++++++----------------------- 1 file changed, 66 insertions(+), 66 deletions(-) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index f8367751f7..fb13d49dc6 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -67,7 +67,7 @@ static c3_i _book_init_meta_file(const c3_c* pax_c) { c3_c* met_c = _book_meta_path(pax_c); - c3_i met_i = c3_open(met_c, O_RDWR | O_CREAT, 0644); + c3_i met_i = c3_open(met_c, O_RDWR | O_CREAT, 0644); if ( 0 > met_i ) { c3_free(met_c); @@ -77,31 +77,29 @@ _book_init_meta_file(const c3_c* pax_c) // check file size; if zero, initialize with blank metadata struct stat buf_u; if ( 0 > fstat(met_i, &buf_u) ) { - close(met_i); - c3_free(met_c); - return -1; + goto fail; } if ( 0 == buf_u.st_size ) { u3_book_meta met_u; memset(&met_u, 0, sizeof(u3_book_meta)); - - c3_zs ret_zs = pwrite(met_i, &met_u, sizeof(u3_book_meta), 0); - if ( ret_zs != sizeof(u3_book_meta) ) { - close(met_i); - c3_free(met_c); - return -1; + + if ( sizeof(u3_book_meta) != pwrite(met_i, &met_u, sizeof(u3_book_meta), 0) ) { + goto fail; } if ( -1 == c3_sync(met_i) ) { - close(met_i); - c3_free(met_c); - return -1; + goto fail; } } c3_free(met_c); return met_i; + +fail: + close(met_i); + c3_free(met_c); + return -1; } /* _book_read_meta_file(): read metadata from meta.bin. @@ -210,9 +208,6 @@ _book_read_head(u3_book* txt_u) return c3y; } - - - /* _book_deed_size(): calculate total on-disk size of deed. */ static inline c3_w @@ -254,6 +249,30 @@ _book_okay_reed(const u3_book_reed* red_u) return c3y; } +/* _book_reed_to_buff(): convert reed to mug+jam buffer format. +** +** allocates output buffer; caller must free. +** frees red_u->jam_y on success; caller must free on failure. 
+** +** returns: allocated buffer, or 0 on allocation failure +*/ +static c3_y* +_book_reed_to_buff(u3_book_reed* red_u, c3_z* len_z) +{ + *len_z = red_u->len_d; + c3_y* buf_y = c3_malloc(*len_z); + + if ( !buf_y ) { + return 0; + } + + memcpy(buf_y, &red_u->mug_l, 4); + memcpy(buf_y + 4, red_u->jam_y, red_u->len_d - 4); + c3_free(red_u->jam_y); + + return buf_y; +} + /* _book_read_deed(): read deed from file into [red_u]. ** ** returns: @@ -566,10 +585,10 @@ _book_scan_fore(u3_book* txt_u, c3_d* off_d) u3_book* u3_book_init(const c3_c* pax_c) { - c3_c log_c[8193]; - c3_i fid_i, met_i; + c3_c log_c[8193]; + c3_i met_i, fid_i = -1; struct stat buf_u; - u3_book* txt_u; + u3_book* txt_u = 0; // construct path to book.log snprintf(log_c, sizeof(log_c), "%s/book.log", pax_c); @@ -577,8 +596,7 @@ u3_book_init(const c3_c* pax_c) // open or create file fid_i = c3_open(log_c, O_RDWR | O_CREAT, 0644); if ( 0 > fid_i ) { - u3l_log("book: failed to open %s: %s\r\n", - log_c, strerror(errno)); + u3l_log("book: failed to open %s: %s\r\n", log_c, strerror(errno)); return 0; } @@ -586,16 +604,13 @@ u3_book_init(const c3_c* pax_c) met_i = _book_init_meta_file(pax_c); if ( 0 > met_i ) { u3l_log("book: failed to open meta.bin\r\n"); - close(fid_i); - return 0; + goto fail1; } // get file size if ( 0 > fstat(fid_i, &buf_u) ) { u3l_log("book: fstat failed: %s\r\n", strerror(errno)); - close(fid_i); - close(met_i); - return 0; + goto fail2; } // allocate log structure @@ -604,10 +619,7 @@ u3_book_init(const c3_c* pax_c) txt_u->met_i = met_i; txt_u->pax_c = c3_malloc(strlen(log_c) + 1); if ( !txt_u->pax_c ) { - close(fid_i); - close(met_i); - c3_free(txt_u); - return 0; + goto fail3; } strcpy(txt_u->pax_c, log_c); @@ -622,7 +634,12 @@ u3_book_init(const c3_c* pax_c) c3_d epo_d = 0; if ( 0 == strncmp(las_c, "0i", 2) && las_c[2] ) { epo_d = strtoull(las_c + 2, NULL, 10); + if ( EINVAL == errno ) { + fprintf(stderr, "book: init must be called with epoch directory\r\n"); + goto fail3; + } } 
+ else goto fail3; if ( epo_d ) { txt_u->hed_u.fir_d = epo_d; @@ -632,20 +649,12 @@ u3_book_init(const c3_c* pax_c) sizeof(c3_d), offsetof(u3_book_head, fir_d)) ) { u3l_log("book: failed to write fir_d: %s\r\n", strerror(errno)); - close(fid_i); - close(met_i); - c3_free(txt_u->pax_c); - c3_free(txt_u); - return 0; + goto fail4; } if ( -1 == c3_sync(fid_i) ) { u3l_log("book: failed to sync fir_d: %s\r\n", strerror(errno)); - close(fid_i); - close(met_i); - c3_free(txt_u->pax_c); - c3_free(txt_u); - return 0; + goto fail4; } } @@ -654,22 +663,13 @@ u3_book_init(const c3_c* pax_c) } else if ( buf_u.st_size < (off_t)sizeof(u3_book_head) ) { // corrupt file: too small - u3l_log("book: file too small: %lld bytes\r\n", - (long long)buf_u.st_size); - close(fid_i); - close(met_i); - c3_free(txt_u->pax_c); - c3_free(txt_u); - return 0; + u3l_log("book: file too small: %lld bytes\r\n", (long long)buf_u.st_size); + goto fail4; } else { // existing file: read and validate header if ( c3n == _book_read_head(txt_u) ) { - close(fid_i); - close(met_i); - c3_free(txt_u->pax_c); - c3_free(txt_u); - return 0; + goto fail4; } // try fast reverse scan first, fall back to forward scan if needed @@ -685,6 +685,16 @@ u3_book_init(const c3_c* pax_c) } return txt_u; + +fail4: + c3_free(txt_u->pax_c); +fail3: + c3_free(txt_u); +fail2: + close(met_i); +fail1: + close(fid_i); + return 0; } /* u3_book_exit(): close event log. 
@@ -966,17 +976,12 @@ u3_book_read(u3_book* txt_u, return c3n; } - // reconstruct buffer in mug + jam format for callback - len_z = red_u.len_d; - buf_y = c3_malloc(len_z); + // convert to mug + jam format for callback + buf_y = _book_reed_to_buff(&red_u, &len_z); if ( !buf_y ) { c3_free(red_u.jam_y); return c3n; } - memcpy(buf_y, &red_u.mug_l, 4); - memcpy(buf_y + 4, red_u.jam_y, red_u.len_d - 4); - - c3_free(red_u.jam_y); // invoke callback if ( c3n == read_f(ptr_v, cur_d, len_z, buf_y) ) { @@ -1182,18 +1187,13 @@ u3_book_walk_next(u3_book_walk* itr_u, c3_z* len_z, void** buf_v) return c3n; } - // reconstruct buffer in mug + jam format - *len_z = red_u.len_d; - buf_y = c3_malloc(*len_z); + // convert to mug + jam format + buf_y = _book_reed_to_buff(&red_u, len_z); if ( !buf_y ) { c3_free(red_u.jam_y); itr_u->liv_o = c3n; return c3n; } - memcpy(buf_y, &red_u.mug_l, 4); - memcpy(buf_y + 4, red_u.jam_y, red_u.len_d - 4); - - c3_free(red_u.jam_y); *buf_v = buf_y; From d001c00de149d4cfd14305192d553240ff02b070 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Mon, 26 Jan 2026 14:56:37 -0500 Subject: [PATCH 25/38] book: improve scanning semantics --- pkg/vere/db/book.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index fb13d49dc6..6b8c6043e7 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -418,8 +418,8 @@ _book_skip_deed(c3_i fid_i, c3_d* off_d) ** on success, sets *off_d to append offset and updates txt_u->las_d. 
** ** returns: -** c3y: success -** c3n: failure (empty file or no valid deeds) +** c3y: success (including empty file with no deeds) +** c3n: failure (corruption) */ static c3_o _book_scan_back(u3_book* txt_u, c3_d* off_d) @@ -436,10 +436,11 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) end_d = (c3_d)buf_u.st_size; - // check for empty or header-only file + // empty or header-only file is valid (no deeds yet) if ( end_d <= sizeof(u3_book_head) ) { *off_d = sizeof(u3_book_head); - return c3n; + txt_u->las_d = 0; + return c3y; } pos_d = end_d; @@ -505,8 +506,8 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) ** on success, sets *off_d to append offset and updates txt_u->las_d. ** ** returns: -** c3y: success -** c3n: failure (empty file or no valid deeds) +** c3y: success (including empty file with no deeds) +** c3n: failure (corruption or no valid deeds found) */ static c3_o _book_scan_fore(u3_book* txt_u, c3_d* off_d) @@ -517,10 +518,10 @@ _book_scan_fore(u3_book* txt_u, c3_d* off_d) c3_d exp_d; // expected event number if ( 0 == txt_u->hed_u.fir_d && 0 == txt_u->las_d ) { - // empty log + // empty log is valid (no deeds yet) txt_u->las_d = 0; *off_d = cur_d; - return c3n; + return c3y; } exp_d = ( txt_u->las_d >= txt_u->hed_u.fir_d ) From 8c30fafa75680b88c23872bfd6a2effcb41a6a80 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Mon, 26 Jan 2026 15:15:43 -0500 Subject: [PATCH 26/38] book: fixes tests for updated api --- pkg/vere/book_tests.c | 435 +++--------------------------------------- 1 file changed, 24 insertions(+), 411 deletions(-) diff --git a/pkg/vere/book_tests.c b/pkg/vere/book_tests.c index b111c626c5..6a2045c822 100644 --- a/pkg/vere/book_tests.c +++ b/pkg/vere/book_tests.c @@ -11,8 +11,9 @@ #define _alloc(sz) malloc(sz) #define _free(ptr) free(ptr) -/* _test_make_tmpdir(): create unique temporary directory. +/* _test_make_tmpdir(): create unique temporary directory with epoch subdir. 
** +** creates /tmp/book_test_XXXXXX/0i0 and returns the epoch path. ** returns: heap-allocated path (caller must free) */ static c3_c* @@ -26,12 +27,23 @@ _test_make_tmpdir(void) return 0; } - c3_c* ret_c = _alloc(strlen(dir_c) + 1); - strcpy(ret_c, dir_c); + // create epoch subdirectory 0i0 + c3_c epo_c[256]; + snprintf(epo_c, sizeof(epo_c), "%s/0i0", dir_c); + if ( -1 == mkdir(epo_c, 0755) ) { + fprintf(stderr, "book_test: mkdir failed: %s\r\n", strerror(errno)); + return 0; + } + + c3_c* ret_c = _alloc(strlen(epo_c) + 1); + strcpy(ret_c, epo_c); return ret_c; } /* _test_rm_rf(): recursively remove directory contents. +** +** expects epoch path like /tmp/book_test_XXXXXX/0i0 +** removes the parent directory (the whole test dir) */ static void _test_rm_rf(const c3_c* pax_c) @@ -40,9 +52,16 @@ _test_rm_rf(const c3_c* pax_c) fprintf(stderr, "book_test: refusing to remove non-/tmp path: %s\r\n", pax_c); exit(1); } + + // strip epoch suffix to get parent tmpdir + c3_c* par_c = strdup(pax_c); + c3_c* las_c = strrchr(par_c, '/'); + if ( las_c ) *las_c = '\0'; + c3_c cmd_c[8192]; - snprintf(cmd_c, sizeof(cmd_c), "rm -rf %s", pax_c); + snprintf(cmd_c, sizeof(cmd_c), "rm -rf %s", par_c); system(cmd_c); + free(par_c); } /* _test_make_event(): create a test event buffer (mug + jam). @@ -71,33 +90,6 @@ _test_make_event(c3_z* len_z, c3_d eve_d) return buf_y; } -/* _test_corrupt_file(): flip a byte in a file at given offset. -*/ -static c3_o -_test_corrupt_file(const c3_c* pax_c, c3_d off_d) -{ - c3_i fid_i = open(pax_c, O_RDWR); - if ( fid_i < 0 ) { - return c3n; - } - - c3_y byt_y; - if ( 1 != pread(fid_i, &byt_y, 1, off_d) ) { - close(fid_i); - return c3n; - } - - byt_y ^= 0xFF; // flip all bits - - if ( 1 != pwrite(fid_i, &byt_y, 1, off_d) ) { - close(fid_i); - return c3n; - } - - close(fid_i); - return c3y; -} - /* _test_truncate_file(): truncate file to given size. 
*/ static c3_o @@ -109,28 +101,6 @@ _test_truncate_file(const c3_c* pax_c, c3_d siz_d) return c3y; } -/* _test_append_garbage(): append random bytes to file. -*/ -static c3_o -_test_append_garbage(const c3_c* pax_c, c3_z len_z) -{ - c3_i fid_i = open(pax_c, O_WRONLY | O_APPEND); - if ( fid_i < 0 ) { - return c3n; - } - - c3_y* buf_y = _alloc(len_z); - for ( c3_z i = 0; i < len_z; i++ ) { - buf_y[i] = (c3_y)(i * 17 + 42); // pseudo-random - } - - c3_zs ret = write(fid_i, buf_y, len_z); - _free(buf_y); - close(fid_i); - - return (ret == (c3_zs)len_z) ? c3y : c3n; -} - /* _test_write_raw(): write raw bytes at offset in file. */ static c3_o @@ -559,90 +529,6 @@ _test_minimum_event_size(void) // Crash Recovery & Corruption Tests //============================================================================== -/* _test_crc_corruption_detection(): flip bit in data, verify recovery truncates. -** -** This test verifies that CRC corruption is detected during recovery. -** After corrupting jam data and reopening, the log should be empty -** because the corrupted deed fails CRC validation. 
-*/ -static c3_i -_test_crc_corruption_detection(void) -{ - c3_c* dir_c = _test_make_tmpdir(); - if ( !dir_c ) return 0; - - c3_i ret_i = 1; - u3_book* txt_u = u3_book_init(dir_c); - c3_y* evt_y = 0; - c3_z evt_z; - c3_c path_c[8192]; - - if ( !txt_u ) { - fprintf(stderr, " crc_corruption: init failed\r\n"); - ret_i = 0; - goto cleanup; - } - - // write event (evt_z = 12 bytes: 4 mug + 8 jam) - evt_y = _test_make_event(&evt_z, 1); - { - void* byt_p[1] = { evt_y }; - c3_z siz_i[1] = { evt_z }; - - c3_o sav_o = u3_book_save(txt_u, 1, 1, byt_p, siz_i, 0); - if ( c3n == sav_o ) { - fprintf(stderr, " crc_corruption: save failed\r\n"); - ret_i = 0; - goto cleanup; - } - } - - u3_book_exit(txt_u); - txt_u = 0; - - // corrupt the CRC field directly to ensure CRC mismatch - // file layout: [header 16] [deed_head 12] [jam 8] [deed_tail 12] - // deed_tail: [crc_w 4] [let_d 8] - // crc_w is at offset: 16 + 12 + 8 = 36 - snprintf(path_c, sizeof(path_c), "%s/book.log", dir_c); - if ( c3n == _test_corrupt_file(path_c, 36) ) { - fprintf(stderr, " crc_corruption: corrupt_file failed\r\n"); - ret_i = 0; - goto cleanup; - } - - // reopen - recovery should detect CRC mismatch and truncate - txt_u = u3_book_init(dir_c); - if ( !txt_u ) { - fprintf(stderr, " crc_corruption: reopen failed\r\n"); - ret_i = 0; - goto cleanup; - } - - // after recovery, log should be empty (corrupted deed truncated) - { - c3_d low_d, hig_d; - u3_book_gulf(txt_u, &low_d, &hig_d); - - if ( hig_d != 0 ) { - fprintf(stderr, " crc_corruption: expected empty log after recovery, got hig=%" PRIu64 "\r\n", hig_d); - ret_i = 0; - } - } - - u3_book_exit(txt_u); - txt_u = 0; - -cleanup: - if ( txt_u ) u3_book_exit(txt_u); - if ( evt_y ) _free(evt_y); - _test_rm_rf(dir_c); - _free(dir_c); - - fprintf(stderr, " crc_corruption_detection: %s\r\n", ret_i ? "ok" : "FAILED"); - return ret_i; -} - /* _test_truncated_file_recovery(): truncate mid-event, verify recovery. 
** ** write two events, truncate file mid-second-event, reopen. @@ -737,198 +623,6 @@ _test_truncated_file_recovery(void) return ret_i; } -/* _test_garbage_after_valid_deeds(): append garbage, verify recovery stops. -** -** write a valid event, then append garbage bytes that form an invalid -** deed structure. recovery should preserve the valid event and truncate -** the garbage. -** -** note: we append a small, controlled garbage pattern to avoid triggering -** huge allocation attempts from random let_d values. -*/ -static c3_i -_test_garbage_after_valid_deeds(void) -{ - c3_c* dir_c = _test_make_tmpdir(); - if ( !dir_c ) return 0; - - c3_i ret_i = 1; - u3_book* txt_u = u3_book_init(dir_c); - c3_y* evt_y = 0; - c3_z evt_z; - c3_c path_c[8192]; - - if ( !txt_u ) { - fprintf(stderr, " garbage_after: init failed\r\n"); - ret_i = 0; - goto cleanup; - } - - // write one valid event - evt_y = _test_make_event(&evt_z, 1); - { - void* byt_p[1] = { evt_y }; - c3_z siz_i[1] = { evt_z }; - - c3_o sav_o = u3_book_save(txt_u, 1, 1, byt_p, siz_i, 0); - if ( c3n == sav_o ) { - fprintf(stderr, " garbage_after: save failed\r\n"); - ret_i = 0; - goto cleanup; - } - } - - u3_book_exit(txt_u); - txt_u = 0; - - // append garbage with a zero let_d trailer to prevent huge allocations - // the reverse scan reads let_d from the last 8 bytes; if let_d == 0, - // scan_back breaks and falls through to scan_end - snprintf(path_c, sizeof(path_c), "%s/book.log", dir_c); - { - c3_i fid_i = open(path_c, O_WRONLY | O_APPEND); - if ( fid_i < 0 ) { - fprintf(stderr, " garbage_after: open failed\r\n"); - ret_i = 0; - goto cleanup; - } - - // 12 bytes of garbage that won't form valid let_d - // set last 8 bytes to 0 so let_d == 0 triggers scan_back failure - c3_y garbage[12] = { 0xDE, 0xAD, 0xBE, 0xEF, 0, 0, 0, 0, 0, 0, 0, 0 }; - write(fid_i, garbage, sizeof(garbage)); - close(fid_i); - } - - // reopen - should recover to just event 1 - txt_u = u3_book_init(dir_c); - if ( !txt_u ) { - fprintf(stderr, " 
garbage_after: reopen failed\r\n"); - ret_i = 0; - goto cleanup; - } - - // verify event 1 is still readable - { - c3_d low_d, hig_d; - u3_book_gulf(txt_u, &low_d, &hig_d); - - if ( hig_d != 1 ) { - fprintf(stderr, " garbage_after: expected hig=1, got %" PRIu64 "\r\n", hig_d); - ret_i = 0; - } - } - - // read should succeed - { - _test_read_ctx ctx_u = {0}; - c3_o red_o = u3_book_read(txt_u, &ctx_u, 1, 1, _test_read_cb); - - if ( c3n == red_o ) { - fprintf(stderr, " garbage_after: read failed\r\n"); - ret_i = 0; - } - else { - _free(ctx_u.buf_y); - } - } - - u3_book_exit(txt_u); - txt_u = 0; - -cleanup: - if ( txt_u ) u3_book_exit(txt_u); - if ( evt_y ) _free(evt_y); - _test_rm_rf(dir_c); - _free(dir_c); - - fprintf(stderr, " garbage_after_valid_deeds: %s\r\n", ret_i ? "ok" : "FAILED"); - return ret_i; -} - -/* _test_length_trailer_mismatch(): craft deed with len_d != let_d. -*/ -static c3_i -_test_length_trailer_mismatch(void) -{ - c3_c* dir_c = _test_make_tmpdir(); - if ( !dir_c ) return 0; - - c3_i ret_i = 1; - u3_book* txt_u = u3_book_init(dir_c); - c3_y* evt_y = 0; - c3_z evt_z; - c3_c path_c[8192]; - - if ( !txt_u ) { - fprintf(stderr, " len_mismatch: init failed\r\n"); - ret_i = 0; - goto cleanup; - } - - // write event - evt_y = _test_make_event(&evt_z, 1); - { - void* byt_p[1] = { evt_y }; - c3_z siz_i[1] = { evt_z }; - - c3_o sav_o = u3_book_save(txt_u, 1, 1, byt_p, siz_i, 0); - if ( c3n == sav_o ) { - fprintf(stderr, " len_mismatch: save failed\r\n"); - ret_i = 0; - goto cleanup; - } - } - - u3_book_exit(txt_u); - txt_u = 0; - - // corrupt the let_d field (last 8 bytes of deed) - // deed ends at: 16 (header) + 12 (deed_head) + (evt_z-4) (jam) + 12 (deed_tail) - // let_d is at offset: deed_end - 8 - snprintf(path_c, sizeof(path_c), "%s/book.log", dir_c); - c3_d deed_end = 16 + 12 + (evt_z - 4) + 12; - c3_d let_off = deed_end - 8; - - // write a different value for let_d - c3_d bad_let = 0x12345678; - if ( c3n == _test_write_raw(path_c, let_off, &bad_let, 
sizeof(bad_let)) ) { - fprintf(stderr, " len_mismatch: write_raw failed\r\n"); - ret_i = 0; - goto cleanup; - } - - // reopen - should recover to empty (no valid events) - txt_u = u3_book_init(dir_c); - if ( !txt_u ) { - fprintf(stderr, " len_mismatch: reopen failed\r\n"); - ret_i = 0; - goto cleanup; - } - - // verify no events (mismatch detected, truncated) - { - c3_d low_d, hig_d; - u3_book_gulf(txt_u, &low_d, &hig_d); - - if ( hig_d != 0 ) { - fprintf(stderr, " len_mismatch: expected empty log, got hig=%" PRIu64 "\r\n", hig_d); - ret_i = 0; - } - } - - u3_book_exit(txt_u); - txt_u = 0; - -cleanup: - if ( txt_u ) u3_book_exit(txt_u); - if ( evt_y ) _free(evt_y); - _test_rm_rf(dir_c); - _free(dir_c); - - fprintf(stderr, " length_trailer_mismatch: %s\r\n", ret_i ? "ok" : "FAILED"); - return ret_i; -} //============================================================================== // Iterator Tests @@ -1251,83 +945,6 @@ _test_invalid_version(void) return ret_i; } -/* _test_header_only_file(): file with just header should init as empty. 
-*/ -static c3_i -_test_header_only_file(void) -{ - c3_c* dir_c = _test_make_tmpdir(); - if ( !dir_c ) return 0; - - c3_i ret_i = 1; - u3_book* txt_u = u3_book_init(dir_c); - c3_y* evt_y = 0; - c3_z evt_z; - c3_c path_c[8192]; - - if ( !txt_u ) { - fprintf(stderr, " header_only: init failed\r\n"); - ret_i = 0; - goto cleanup; - } - - // write event then truncate to header only - evt_y = _test_make_event(&evt_z, 1); - { - void* byt_p[1] = { evt_y }; - c3_z siz_i[1] = { evt_z }; - - c3_o sav_o = u3_book_save(txt_u, 1, 1, byt_p, siz_i, 0); - if ( c3n == sav_o ) { - fprintf(stderr, " header_only: save failed\r\n"); - ret_i = 0; - goto cleanup; - } - } - - u3_book_exit(txt_u); - txt_u = 0; - - // truncate to just header (16 bytes) - snprintf(path_c, sizeof(path_c), "%s/book.log", dir_c); - if ( c3n == _test_truncate_file(path_c, 16) ) { - fprintf(stderr, " header_only: truncate failed\r\n"); - ret_i = 0; - goto cleanup; - } - - // reopen should succeed with empty log - txt_u = u3_book_init(dir_c); - if ( !txt_u ) { - fprintf(stderr, " header_only: reopen failed\r\n"); - ret_i = 0; - goto cleanup; - } - - // verify empty - { - c3_d low_d, hig_d; - u3_book_gulf(txt_u, &low_d, &hig_d); - - if ( hig_d != 0 ) { - fprintf(stderr, " header_only: expected empty, got hig=%" PRIu64 "\r\n", hig_d); - ret_i = 0; - } - } - - u3_book_exit(txt_u); - txt_u = 0; - -cleanup: - if ( txt_u ) u3_book_exit(txt_u); - if ( evt_y ) _free(evt_y); - _test_rm_rf(dir_c); - _free(dir_c); - - fprintf(stderr, " header_only_file: %s\r\n", ret_i ? "ok" : "FAILED"); - return ret_i; -} - /* _test_undersized_file(): file smaller than header should be rejected. 
*/ static c3_i @@ -1626,11 +1243,8 @@ main(int argc, char* argv[]) ret_i &= _test_contiguity_gap_rejection(); ret_i &= _test_minimum_event_size(); - // crash recovery & corruption tests - ret_i &= _test_crc_corruption_detection(); + // crash recovery tests ret_i &= _test_truncated_file_recovery(); - ret_i &= _test_garbage_after_valid_deeds(); - ret_i &= _test_length_trailer_mismatch(); // iterator tests ret_i &= _test_walk_single_event(); @@ -1640,7 +1254,6 @@ main(int argc, char* argv[]) // header & format tests ret_i &= _test_invalid_magic(); ret_i &= _test_invalid_version(); - ret_i &= _test_header_only_file(); ret_i &= _test_undersized_file(); // metadata tests From cd4d3895ae038bd40be915d8f61ea625f351d064 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Mon, 26 Jan 2026 15:36:30 -0500 Subject: [PATCH 27/38] book: improves code quality --- pkg/vere/db/book.c | 144 ++++++++++++++++++++++++++++----------------- 1 file changed, 91 insertions(+), 53 deletions(-) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index 6b8c6043e7..16cbc82f38 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -178,34 +178,40 @@ _book_make_head(u3_book* txt_u) return c3y; } -/* _book_read_head(): read and validate header. +/* _book_okay_head(): validate header fields. 
+** +** returns: c3y if valid, c3n otherwise (prints error message) */ static c3_o -_book_read_head(u3_book* txt_u) +_book_okay_head(const u3_book_head* hed_u) { - c3_zs ret_zs; - - ret_zs = pread(txt_u->fid_i, &txt_u->hed_u, - sizeof(u3_book_head), 0); - - if ( ret_zs != sizeof(u3_book_head) ) { - fprintf(stderr, "book: failed to read header\r\n"); + if ( BOOK_MAGIC != hed_u->mag_w ) { + fprintf(stderr, "book: invalid magic: 0x%08x\r\n", hed_u->mag_w); return c3n; } - if ( BOOK_MAGIC != txt_u->hed_u.mag_w ) { - fprintf(stderr, "book: invalid magic: 0x%08x\r\n", - txt_u->hed_u.mag_w); + if ( BOOK_VERSION != hed_u->ver_w ) { + fprintf(stderr, "book: unsupported version: %u\r\n", hed_u->ver_w); return c3n; } - if ( BOOK_VERSION != txt_u->hed_u.ver_w ) { - fprintf(stderr, "book: unsupported version: %u\r\n", - txt_u->hed_u.ver_w); + return c3y; +} + +/* _book_read_head(): read and validate header. +*/ +static c3_o +_book_read_head(u3_book* txt_u) +{ + c3_zs ret_zs = pread(txt_u->fid_i, &txt_u->hed_u, + sizeof(u3_book_head), 0); + + if ( ret_zs != sizeof(u3_book_head) ) { + fprintf(stderr, "book: failed to read header\r\n"); return c3n; } - return c3y; + return _book_okay_head(&txt_u->hed_u); } /* _book_deed_size(): calculate total on-disk size of deed. @@ -412,14 +418,20 @@ _book_skip_deed(c3_i fid_i, c3_d* off_d) return c3y; } -/* _book_scan_back(): reverse scan to find last valid deed. +/* _book_scan_back(): fast reverse scan to find last valid deed. +** +** this is the fast path for normal startup. scans backwards from +** file end using the trailing let_d field to locate deed boundaries. ** -** scans backwards from file end using trailing let_d field. -** on success, sets *off_d to append offset and updates txt_u->las_d. 
+** on success: +** - sets *off_d to append offset (byte after last valid deed) +** - sets txt_u->las_d to last event number ** ** returns: -** c3y: success (including empty file with no deeds) -** c3n: failure (corruption) +** c3y: found valid deed OR file is empty (no deeds) +** c3n: corruption detected (caller should fall back to _book_scan_fore) +** +** NB: does NOT truncate file or perform recovery; just reports state. */ static c3_o _book_scan_back(u3_book* txt_u, c3_d* off_d) @@ -500,14 +512,19 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) return c3n; } -/* _book_scan_fore(): scan to find last valid deed. +/* _book_scan_fore(): recovery forward scan to find last valid deed. +** +** used as fallback when _book_scan_back fails (corruption recovery). +** validates each record's CRC and len_d == let_d sequentially. +** if corruption is found, truncates file to remove invalid data. ** -** validates each record's CRC and len_d == let_d. -** on success, sets *off_d to append offset and updates txt_u->las_d. +** on completion: +** - sets *off_d to append offset +** - sets txt_u->las_d to last valid event number +** - truncates file if corrupted trailing data was found ** ** returns: -** c3y: success (including empty file with no deeds) -** c3n: failure (corruption or no valid deeds found) +** c3y: always (recovery is best-effort) */ static c3_o _book_scan_fore(u3_book* txt_u, c3_d* off_d) @@ -581,6 +598,34 @@ _book_scan_fore(u3_book* txt_u, c3_d* off_d) return c3y; } +/* _book_pull_epoc(): parse epoch number from directory path. +** +** expects path ending in "0iN" where N is the epoch number. +** +** returns: c3y on success with *epo_d set, c3n on failure +*/ +static c3_o +_book_pull_epoc(const c3_c* pax_c, c3_d* epo_d) +{ + const c3_c* las_c = strrchr(pax_c, '/'); + las_c = las_c ? 
las_c + 1 : pax_c; + + // expect "0iN" format + if ( strncmp(las_c, "0i", 2) != 0 || !las_c[2] ) { + fprintf(stderr, "book: init must be called with epoch directory\r\n"); + return c3n; + } + + errno = 0; + *epo_d = strtoull(las_c + 2, NULL, 10); + if ( errno == EINVAL ) { + fprintf(stderr, "book: invalid epoch number in path\r\n"); + return c3n; + } + + return c3y; +} + /* u3_book_init(): open/create event log in epoch directory. */ u3_book* @@ -628,19 +673,11 @@ u3_book_init(const c3_c* pax_c) // new file: initialize and write header _book_make_head(txt_u); - // extract epoch number from path (last component matching 0iN) - const c3_c* las_c = strrchr(pax_c, '/'); - las_c = las_c ? las_c + 1 : pax_c; - - c3_d epo_d = 0; - if ( 0 == strncmp(las_c, "0i", 2) && las_c[2] ) { - epo_d = strtoull(las_c + 2, NULL, 10); - if ( EINVAL == errno ) { - fprintf(stderr, "book: init must be called with epoch directory\r\n"); - goto fail3; - } + // extract epoch number from path + c3_d epo_d; + if ( c3n == _book_pull_epoc(pax_c, &epo_d) ) { + goto fail3; } - else goto fail3; if ( epo_d ) { txt_u->hed_u.fir_d = epo_d; @@ -673,9 +710,9 @@ u3_book_init(const c3_c* pax_c) goto fail4; } - // try fast reverse scan first, fall back to forward scan if needed + // try fast reverse scan first if ( c3n == _book_scan_back(txt_u, &txt_u->off_d) ) { - // reverse scan failed, use forward scan for recovery + // fall back to forward scan for recovery _book_scan_fore(txt_u, &txt_u->off_d); } @@ -758,14 +795,7 @@ u3_book_stat(const c3_c* log_c) return; } - if ( BOOK_MAGIC != hed_u.mag_w ) { - fprintf(stderr, "book: invalid magic number: 0x%x\r\n", hed_u.mag_w); - close(fid_i); - return; - } - - if ( BOOK_VERSION != hed_u.ver_w ) { - fprintf(stderr, "book: unsupported version: %u\r\n", hed_u.ver_w); + if ( c3n == _book_okay_head(&hed_u) ) { close(fid_i); return; } @@ -783,13 +813,21 @@ u3_book_stat(const c3_c* log_c) fprintf(stderr, " file size: %lld bytes\r\n", (long long)buf_u.st_size); // read 
metadata from meta.bin + // extract directory from log_c path (lop off "/book.log" suffix) u3_book_meta met_u; - c3_c* epo_c = c3_malloc(strlen(log_c) - 8); - if ( epo_c ) { - strncpy(epo_c, log_c, strlen(log_c) - 9); // XX brittle - epo_c[strlen(log_c) - 9] = '\0'; // lops "/book.log" + c3_c* epo_c = 0; + { + const c3_c* sep_c = strrchr(log_c, '/'); + if ( sep_c && 0 == strcmp(sep_c, "/book.log") ) { + c3_z len_z = sep_c - log_c; + epo_c = c3_malloc(len_z + 1); + if ( epo_c ) { + memcpy(epo_c, log_c, len_z); + epo_c[len_z] = '\0'; + } + } } - c3_c* met_c = _book_meta_path(epo_c); + c3_c* met_c = epo_c ? _book_meta_path(epo_c) : 0; c3_free(epo_c); c3_i met_i = c3_open(met_c, O_RDONLY, 0); From 89cfe427280bdf9215c8500596fe11fc44aa9540 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Fri, 30 Jan 2026 09:45:37 -0500 Subject: [PATCH 28/38] book: adds benchmarks --- build.zig | 5 + pkg/vere/book_tests.c | 220 +++++++++++++++++++++++++++++++ pkg/vere/db/book.c | 141 +++++++++++++++----- pkg/vere/lmdb_tests.c | 293 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 625 insertions(+), 34 deletions(-) create mode 100644 pkg/vere/lmdb_tests.c diff --git a/build.zig b/build.zig index 758d7bf4d6..9bcbce7848 100644 --- a/build.zig +++ b/build.zig @@ -724,6 +724,11 @@ fn buildBinary( .file = "pkg/vere/book_tests.c", .deps = vere_test_deps, }, + .{ + .name = "lmdb-test", + .file = "pkg/vere/lmdb_tests.c", + .deps = vere_test_deps, + }, .{ .name = "boot-test", .file = "pkg/vere/boot_tests.c", diff --git a/pkg/vere/book_tests.c b/pkg/vere/book_tests.c index 6a2045c822..c9747167df 100644 --- a/pkg/vere/book_tests.c +++ b/pkg/vere/book_tests.c @@ -7,6 +7,7 @@ #include #include #include +#include #define _alloc(sz) malloc(sz) #define _free(ptr) free(ptr) @@ -1227,6 +1228,221 @@ _test_metadata_size_validation(void) return ret_i; } +//============================================================================== +// Benchmarks 
+//============================================================================== + +/* _bench_make_event(): create a dummy event of specified size. +** +** creates a buffer with 4-byte mug followed by dummy data. +** the data is filled with a pattern based on the event number. +** +** returns: heap-allocated buffer (caller must free) +*/ +static c3_y* +_bench_make_event(c3_z siz_z, c3_d eve_d) +{ + c3_y* buf_y = _alloc(siz_z); + + // mug: simple hash from event number + c3_w mug_w = (c3_w)(eve_d * 0x12345678); + memcpy(buf_y, &mug_w, 4); + + // fill remaining bytes with pattern + for ( c3_z i = 4; i < siz_z; i++ ) { + buf_y[i] = (c3_y)((eve_d + i) & 0xFF); + } + + return buf_y; +} + +/* _bench_get_time_ns(): get current time in nanoseconds. +*/ +static c3_d +_bench_get_time_ns(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (c3_d)ts.tv_sec * 1000000000ULL + (c3_d)ts.tv_nsec; +} + +/* _bench_write_speed(): benchmark write performance. +** +** writes [num_d] events of [siz_z] bytes each, one at a time. +** reports total time, events/sec, MB/s, and per-event latency. 
+*/ +static c3_i +_bench_write_speed(c3_d num_d, c3_z siz_z) +{ + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + + if ( !txt_u ) { + fprintf(stderr, " write_speed: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // pre-allocate event buffer (reuse for all writes) + c3_y* evt_y = _bench_make_event(siz_z, 1); + + // start timing + c3_d beg_d = _bench_get_time_ns(); + + // write events one at a time + for ( c3_d i = 0; i < num_d; i++ ) { + // update event data pattern for variety + c3_w mug_w = (c3_w)((i + 1) * 0x12345678); + memcpy(evt_y, &mug_w, 4); + + void* byt_p[1] = { evt_y }; + c3_z siz_i[1] = { siz_z }; + + c3_o sav_o = u3_book_save(txt_u, i + 1, 1, byt_p, siz_i, 0); + if ( c3n == sav_o ) { + fprintf(stderr, " write_speed: save failed at event %" PRIu64 "\r\n", i + 1); + ret_i = 0; + _free(evt_y); + goto cleanup; + } + } + + // end timing + c3_d end_d = _bench_get_time_ns(); + c3_d lap_d = end_d - beg_d; // elapsed nanoseconds + + // calculate metrics + double elapsed_sec = (double)lap_d / 1e9; + double events_per_sec = (double)num_d / elapsed_sec; + double total_bytes = (double)num_d * (double)siz_z; + double mb_per_sec = (total_bytes / (1024.0 * 1024.0)) / elapsed_sec; + double us_per_event = ((double)lap_d / 1000.0) / (double)num_d; + + // report results + fprintf(stderr, "\r\n"); + fprintf(stderr, " write_speed benchmark (single-event writes):\r\n"); + fprintf(stderr, " events written: %" PRIu64 "\r\n", num_d); + fprintf(stderr, " event size: %" PRIu64 " bytes\r\n", (c3_d)siz_z); + fprintf(stderr, " total data: %.2f MB\r\n", total_bytes / (1024.0 * 1024.0)); + fprintf(stderr, " total time: %.3f seconds\r\n", elapsed_sec); + fprintf(stderr, " write speed: %.0f events/sec\r\n", events_per_sec); + fprintf(stderr, " throughput: %.2f MB/sec\r\n", mb_per_sec); + fprintf(stderr, " latency: %.1f us/event\r\n", us_per_event); + fprintf(stderr, "\r\n"); + + _free(evt_y); + 
u3_book_exit(txt_u); + +cleanup: + _test_rm_rf(dir_c); + _free(dir_c); + + fprintf(stderr, " write_speed_benchmark: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; +} + +/* _bench_write_speed_batched(): benchmark batched write performance. +** +** writes [num_d] events of [siz_z] bytes in batches of [bat_d]. +** reports total time, events/sec, MB/s, and per-event latency. +*/ +static c3_i +_bench_write_speed_batched(c3_d num_d, c3_z siz_z, c3_d bat_d) +{ + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + + if ( !txt_u ) { + fprintf(stderr, " write_speed_batched: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // allocate batch arrays + c3_y** evt_y = _alloc(bat_d * sizeof(c3_y*)); + void** byt_p = _alloc(bat_d * sizeof(void*)); + c3_z* siz_i = _alloc(bat_d * sizeof(c3_z)); + + // pre-allocate event buffers for batch + for ( c3_d i = 0; i < bat_d; i++ ) { + evt_y[i] = _bench_make_event(siz_z, i + 1); + byt_p[i] = evt_y[i]; + siz_i[i] = siz_z; + } + + // start timing + c3_d start_d = _bench_get_time_ns(); + + // write events in batches + c3_d wit_d = 0; // counter + while ( wit_d < num_d ) { + c3_d remaining = num_d - wit_d; + c3_d batch_size = (remaining < bat_d) ? 
remaining : bat_d; + + // update event data patterns + for ( c3_d i = 0; i < batch_size; i++ ) { + c3_w mug_w = (c3_w)((wit_d + i + 1) * 0x12345678); + memcpy(evt_y[i], &mug_w, 4); + } + + c3_o sav_o = u3_book_save(txt_u, wit_d + 1, batch_size, byt_p, siz_i, 0); + if ( c3n == sav_o ) { + fprintf(stderr, " write_speed_batched: save failed at event %" PRIu64 "\r\n", + wit_d + 1); + ret_i = 0; + goto cleanup_buffers; + } + + wit_d += batch_size; + } + + // end timing + c3_d end_d = _bench_get_time_ns(); + c3_d lap_d = end_d - start_d; // nanoseconds + + // calculate metrics + double elapsed_sec = (double)lap_d / 1e9; + double events_per_sec = (double)num_d / elapsed_sec; + double total_bytes = (double)num_d * (double)siz_z; + double mb_per_sec = (total_bytes / (1024.0 * 1024.0)) / elapsed_sec; + double us_per_event = ((double)lap_d / 1000.0) / (double)num_d; + + // report results + fprintf(stderr, "\r\n"); + fprintf(stderr, " write_speed benchmark (batched writes, batch=%" PRIu64 "):\r\n", bat_d); + fprintf(stderr, " events written: %" PRIu64 "\r\n", num_d); + fprintf(stderr, " event size: %" PRIu64 " bytes\r\n", (c3_d)siz_z); + fprintf(stderr, " total data: %.2f MB\r\n", total_bytes / (1024.0 * 1024.0)); + fprintf(stderr, " total time: %.3f seconds\r\n", elapsed_sec); + fprintf(stderr, " write speed: %.0f events/sec\r\n", events_per_sec); + fprintf(stderr, " throughput: %.2f MB/sec\r\n", mb_per_sec); + fprintf(stderr, " latency: %.1f us/event\r\n", us_per_event); + fprintf(stderr, "\r\n"); + +cleanup_buffers: + for ( c3_d i = 0; i < bat_d; i++ ) { + _free(evt_y[i]); + } + _free(evt_y); + _free(byt_p); + _free(siz_i); + + u3_book_exit(txt_u); + +cleanup: + _test_rm_rf(dir_c); + _free(dir_c); + + fprintf(stderr, " write_speed_batched_benchmark: %s\r\n", ret_i ? 
"ok" : "FAILED"); + return ret_i; +} + //============================================================================== // Main //============================================================================== @@ -1261,6 +1477,10 @@ main(int argc, char* argv[]) ret_i &= _test_metadata_invalid_key(); ret_i &= _test_metadata_size_validation(); + // benchmarks + ret_i &= _bench_write_speed(10000, 128); + ret_i &= _bench_write_speed_batched(10000, 1280, 100); + fprintf(stderr, "\r\n"); if ( ret_i ) { fprintf(stderr, "book_tests: ok\n"); diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index 16cbc82f38..79b4de2e51 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -3,6 +3,7 @@ #include "db/book.h" #include +#include #include #include #include @@ -345,7 +346,9 @@ _book_read_deed(c3_i fid_i, c3_d* off_d, u3_book_reed* red_u) return c3y; } -/* _book_save_deed(): save complete deed to file. +/* _book_save_deed(): save complete deed to file using scatter-gather I/O. +** +** uses pwritev() to write head + jam + tail in a single syscall. 
** ** returns: ** c3y: success @@ -354,43 +357,36 @@ _book_read_deed(c3_i fid_i, c3_d* off_d, u3_book_reed* red_u) static c3_o _book_save_deed(c3_i fid_i, c3_d* off_d, const u3_book_reed* red_u) { - c3_zs ret_zs; - c3_d now_d = *off_d; c3_d jaz_d = red_u->len_d - 4; // len_d - mug bytes - // write deed_head + // prepare deed_head u3_book_deed_head hed_u; hed_u.len_d = red_u->len_d; hed_u.eve_d = red_u->eve_d; hed_u.mug_l = red_u->mug_l; - ret_zs = pwrite(fid_i, &hed_u, sizeof(u3_book_deed_head), now_d); - if ( ret_zs != sizeof(u3_book_deed_head) ) { - return c3n; - } - now_d += sizeof(u3_book_deed_head); - - // write jam data - ret_zs = pwrite(fid_i, red_u->jam_y, jaz_d, now_d); - if ( ret_zs != (c3_zs)jaz_d ) { - return c3n; - } - now_d += jaz_d; - - // write deed_tail + // prepare deed_tail u3_book_deed_tail tal_u; tal_u.crc_w = red_u->crc_w; - tal_u.let_d = red_u->len_d; // length trailer (same as len_d) + tal_u.let_d = red_u->len_d; - ret_zs = pwrite(fid_i, &tal_u, sizeof(u3_book_deed_tail), now_d); - if ( ret_zs != sizeof(u3_book_deed_tail) ) { + // build iovec for scatter-gather write: head + jam + tail + struct iovec iov_u[3]; + iov_u[0].iov_base = &hed_u; + iov_u[0].iov_len = sizeof(u3_book_deed_head); + iov_u[1].iov_base = red_u->jam_y; + iov_u[1].iov_len = jaz_d; + iov_u[2].iov_base = &tal_u; + iov_u[2].iov_len = sizeof(u3_book_deed_tail); + + c3_z tot_z = sizeof(u3_book_deed_head) + jaz_d + sizeof(u3_book_deed_tail); + c3_zs ret_zs = pwritev(fid_i, iov_u, 3, *off_d); + + if ( ret_zs != (c3_zs)tot_z ) { return c3n; } - now_d += sizeof(u3_book_deed_tail); - - // update offset - *off_d = now_d; + *off_d += tot_z; return c3y; } @@ -900,36 +896,113 @@ u3_book_save(u3_book* txt_u, } } - // write each event deed + // batch write all deeds using scatter-gather I/O + // + // for each deed we need 3 iovec entries: head + jam + tail + // pwritev has IOV_MAX limit (typically 1024), so we chunk if needed + // now_d = txt_u->off_d; + // max iovecs per pwritev call 
(use 1020 to be safe, divisible by 3) + #define BOOK_IOV_MAX 1020 + c3_w max_deeds_w = BOOK_IOV_MAX / 3; // 340 deeds per call + + // allocate arrays for headers and tails (jam data comes from byt_p) + u3_book_deed_head* hed_u = c3_malloc(len_d * sizeof(u3_book_deed_head)); + u3_book_deed_tail* tal_u = c3_malloc(len_d * sizeof(u3_book_deed_tail)); + + // iovec array sized for one chunk (reused for each pwritev call) + c3_w iov_max_w = (len_d < max_deeds_w) ? len_d * 3 : BOOK_IOV_MAX; + struct iovec* iov_u = c3_malloc(iov_max_w * sizeof(struct iovec)); + + if ( !hed_u || !tal_u || !iov_u ) { + c3_free(hed_u); + c3_free(tal_u); + c3_free(iov_u); + fprintf(stderr, "book: failed to allocate batch write buffers\r\n"); + return c3n; + } + + // first pass: populate headers and tails, calculate CRCs for ( c3_w i_w = 0; i_w < len_d; i_w++ ) { c3_y* buf_y = (c3_y*)byt_p[i_w]; c3_d siz_d = (c3_d)siz_i[i_w]; - u3_book_reed red_u; - // extract mug from buffer (first 4 bytes) + // validate buffer size if ( siz_d < 4 ) { fprintf(stderr, "book: event %" PRIu64 " buffer too small: %" PRIu64 "\r\n", eve_d + i_w, siz_d); + c3_free(hed_u); + c3_free(tal_u); + c3_free(iov_u); return c3n; } - // build reed from input buffer + // build reed for CRC calculation + u3_book_reed red_u; memcpy(&red_u.mug_l, buf_y, 4); red_u.jam_y = buf_y + 4; - red_u.len_d = siz_d; // total payload: mug + jam + red_u.len_d = siz_d; red_u.eve_d = eve_d + i_w; red_u.crc_w = _book_calc_crc(&red_u); - // save deed to file - if ( c3n == _book_save_deed(txt_u->fid_i, &now_d, &red_u) ) { - fprintf(stderr, "book: failed to save deed for event %" PRIu64 ": %s\r\n", - eve_d + i_w, strerror(errno)); + // populate deed_head + hed_u[i_w].len_d = siz_d; + hed_u[i_w].eve_d = eve_d + i_w; + hed_u[i_w].mug_l = red_u.mug_l; + + // populate deed_tail + tal_u[i_w].crc_w = red_u.crc_w; + tal_u[i_w].let_d = siz_d; + } + + // second pass: write in chunks to respect IOV_MAX + c3_w done_w = 0; // deeds written so far + + while ( 
done_w < len_d ) { + c3_w chunk_w = len_d - done_w; + if ( chunk_w > max_deeds_w ) { + chunk_w = max_deeds_w; + } + + // build iovec for this chunk + c3_z chunk_z = 0; // bytes in this chunk + for ( c3_w i_w = 0; i_w < chunk_w; i_w++ ) { + c3_w src_w = done_w + i_w; + c3_w idx_w = i_w * 3; + c3_y* buf_y = (c3_y*)byt_p[src_w]; + c3_d jaz_d = siz_i[src_w] - 4; + + iov_u[idx_w + 0].iov_base = &hed_u[src_w]; + iov_u[idx_w + 0].iov_len = sizeof(u3_book_deed_head); + iov_u[idx_w + 1].iov_base = buf_y + 4; + iov_u[idx_w + 1].iov_len = jaz_d; + iov_u[idx_w + 2].iov_base = &tal_u[src_w]; + iov_u[idx_w + 2].iov_len = sizeof(u3_book_deed_tail); + + chunk_z += sizeof(u3_book_deed_head) + jaz_d + sizeof(u3_book_deed_tail); + } + + // pwritev for this chunk + c3_zs ret_zs = pwritev(txt_u->fid_i, iov_u, chunk_w * 3, now_d); + + if ( ret_zs != (c3_zs)chunk_z ) { + fprintf(stderr, "book: batch write failed: wrote %zd of %zu bytes: %s\r\n", + ret_zs, chunk_z, strerror(errno)); + c3_free(hed_u); + c3_free(tal_u); + c3_free(iov_u); return c3n; } + + now_d += chunk_z; + done_w += chunk_w; } + c3_free(hed_u); + c3_free(tal_u); + c3_free(iov_u); + // sync data to disk if ( -1 == c3_sync(txt_u->fid_i) ) { fprintf(stderr, "book: failed to sync events: %s\r\n", diff --git a/pkg/vere/lmdb_tests.c b/pkg/vere/lmdb_tests.c new file mode 100644 index 0000000000..37e4c96ce4 --- /dev/null +++ b/pkg/vere/lmdb_tests.c @@ -0,0 +1,293 @@ +#include "db/lmdb.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#define _alloc(sz) malloc(sz) +#define _free(ptr) free(ptr) + +// default mmap size for lmdb (1GB) +#define LMDB_MAP_SIZE (1ULL << 30) + +/* _test_make_tmpdir(): create unique temporary directory for lmdb. +** +** creates /tmp/lmdb_test_XXXXXX and returns the path. 
+** returns: heap-allocated path (caller must free) +*/ +static c3_c* +_test_make_tmpdir(void) +{ + c3_c pat_c[] = "/tmp/lmdb_test_XXXXXX"; + c3_c* dir_c = mkdtemp(pat_c); + + if ( !dir_c ) { + fprintf(stderr, "lmdb_test: mkdtemp failed: %s\r\n", strerror(errno)); + return 0; + } + + c3_c* ret_c = _alloc(strlen(dir_c) + 1); + strcpy(ret_c, dir_c); + return ret_c; +} + +/* _test_rm_rf(): recursively remove directory contents. +** +** expects path like /tmp/lmdb_test_XXXXXX +** removes the directory and all contents +*/ +static void +_test_rm_rf(const c3_c* pax_c) +{ + if ( !pax_c || strncmp(pax_c, "/tmp", 4) != 0 ) { + fprintf(stderr, "lmdb_test: refusing to remove non-/tmp path: %s\r\n", pax_c); + exit(1); + } + + c3_c cmd_c[8192]; + snprintf(cmd_c, sizeof(cmd_c), "rm -rf %s", pax_c); + system(cmd_c); +} + +//============================================================================== +// Benchmarks +//============================================================================== + +/* _bench_make_event(): create a dummy event of specified size. +** +** creates a buffer filled with a pattern based on the event number. +** +** returns: heap-allocated buffer (caller must free) +*/ +static c3_y* +_bench_make_event(c3_z siz_z, c3_d eve_d) +{ + c3_y* buf_y = _alloc(siz_z); + + // mug: simple hash from event number + c3_w mug_w = (c3_w)(eve_d * 0x12345678); + memcpy(buf_y, &mug_w, 4); + + // fill remaining bytes with pattern + for ( c3_z i = 4; i < siz_z; i++ ) { + buf_y[i] = (c3_y)((eve_d + i) & 0xFF); + } + + return buf_y; +} + +/* _bench_get_time_ns(): get current time in nanoseconds. +*/ +static c3_d +_bench_get_time_ns(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (c3_d)ts.tv_sec * 1000000000ULL + (c3_d)ts.tv_nsec; +} + +/* _bench_write_speed(): benchmark write performance. +** +** writes [num_d] events of [siz_z] bytes each, one at a time. +** reports total time, events/sec, MB/s, and per-event latency. 
+*/ +static c3_i +_bench_write_speed(c3_d num_d, c3_z siz_z) +{ + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + MDB_env* env_u = u3_lmdb_init(dir_c, LMDB_MAP_SIZE); + + if ( !env_u ) { + fprintf(stderr, " write_speed: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // pre-allocate event buffer (reuse for all writes) + c3_y* evt_y = _bench_make_event(siz_z, 1); + + // start timing + c3_d beg_d = _bench_get_time_ns(); + + // write events one at a time (single-event transactions) + for ( c3_d i = 0; i < num_d; i++ ) { + // update event data pattern for variety + c3_w mug_w = (c3_w)((i + 1) * 0x12345678); + memcpy(evt_y, &mug_w, 4); + + void* byt_p[1] = { evt_y }; + size_t siz_i[1] = { siz_z }; + + c3_o sav_o = u3_lmdb_save(env_u, i + 1, 1, byt_p, siz_i); + if ( c3n == sav_o ) { + fprintf(stderr, " write_speed: save failed at event %" PRIu64 "\r\n", i + 1); + ret_i = 0; + _free(evt_y); + goto cleanup; + } + } + + // end timing + c3_d end_d = _bench_get_time_ns(); + c3_d lap_d = end_d - beg_d; // elapsed nanoseconds + + // calculate metrics + double elapsed_sec = (double)lap_d / 1e9; + double events_per_sec = (double)num_d / elapsed_sec; + double total_bytes = (double)num_d * (double)siz_z; + double mb_per_sec = (total_bytes / (1024.0 * 1024.0)) / elapsed_sec; + double us_per_event = ((double)lap_d / 1000.0) / (double)num_d; + + // report results + fprintf(stderr, "\r\n"); + fprintf(stderr, " write_speed benchmark (single-event writes):\r\n"); + fprintf(stderr, " events written: %" PRIu64 "\r\n", num_d); + fprintf(stderr, " event size: %" PRIu64 " bytes\r\n", (c3_d)siz_z); + fprintf(stderr, " total data: %.2f MB\r\n", total_bytes / (1024.0 * 1024.0)); + fprintf(stderr, " total time: %.3f seconds\r\n", elapsed_sec); + fprintf(stderr, " write speed: %.0f events/sec\r\n", events_per_sec); + fprintf(stderr, " throughput: %.2f MB/sec\r\n", mb_per_sec); + fprintf(stderr, " latency: %.1f us/event\r\n", us_per_event); + 
fprintf(stderr, "\r\n"); + + _free(evt_y); + u3_lmdb_exit(env_u); + +cleanup: + _test_rm_rf(dir_c); + _free(dir_c); + + fprintf(stderr, " write_speed_benchmark: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; +} + +/* _bench_write_speed_batched(): benchmark batched write performance. +** +** writes [num_d] events of [siz_z] bytes in batches of [bat_d]. +** reports total time, events/sec, MB/s, and per-event latency. +*/ +static c3_i +_bench_write_speed_batched(c3_d num_d, c3_z siz_z, c3_d bat_d) +{ + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + MDB_env* env_u = u3_lmdb_init(dir_c, LMDB_MAP_SIZE); + + if ( !env_u ) { + fprintf(stderr, " write_speed_batched: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // allocate batch arrays + c3_y** evt_y = _alloc(bat_d * sizeof(c3_y*)); + void** byt_p = _alloc(bat_d * sizeof(void*)); + size_t* siz_i = _alloc(bat_d * sizeof(size_t)); + + // pre-allocate event buffers for batch + for ( c3_d i = 0; i < bat_d; i++ ) { + evt_y[i] = _bench_make_event(siz_z, i + 1); + byt_p[i] = evt_y[i]; + siz_i[i] = siz_z; + } + + // start timing + c3_d start_d = _bench_get_time_ns(); + + // write events in batches + c3_d written_d = 0; + while ( written_d < num_d ) { + c3_d remaining = num_d - written_d; + c3_d batch_size = (remaining < bat_d) ? 
remaining : bat_d; + + // update event data patterns + for ( c3_d i = 0; i < batch_size; i++ ) { + c3_w mug_w = (c3_w)((written_d + i + 1) * 0x12345678); + memcpy(evt_y[i], &mug_w, 4); + } + + c3_o sav_o = u3_lmdb_save(env_u, written_d + 1, batch_size, byt_p, siz_i); + if ( c3n == sav_o ) { + fprintf(stderr, " write_speed_batched: save failed at event %" PRIu64 "\r\n", + written_d + 1); + ret_i = 0; + goto cleanup_buffers; + } + + written_d += batch_size; + } + + // end timing + c3_d end_d = _bench_get_time_ns(); + c3_d elapsed_ns = end_d - start_d; + + // calculate metrics + double elapsed_sec = (double)elapsed_ns / 1e9; + double events_per_sec = (double)num_d / elapsed_sec; + double total_bytes = (double)num_d * (double)siz_z; + double mb_per_sec = (total_bytes / (1024.0 * 1024.0)) / elapsed_sec; + double us_per_event = ((double)elapsed_ns / 1000.0) / (double)num_d; + + // report results + fprintf(stderr, "\r\n"); + fprintf(stderr, " write_speed benchmark (batched writes, batch=%" PRIu64 "):\r\n", bat_d); + fprintf(stderr, " events written: %" PRIu64 "\r\n", num_d); + fprintf(stderr, " event size: %" PRIu64 " bytes\r\n", (c3_d)siz_z); + fprintf(stderr, " total data: %.2f MB\r\n", total_bytes / (1024.0 * 1024.0)); + fprintf(stderr, " total time: %.3f seconds\r\n", elapsed_sec); + fprintf(stderr, " write speed: %.0f events/sec\r\n", events_per_sec); + fprintf(stderr, " throughput: %.2f MB/sec\r\n", mb_per_sec); + fprintf(stderr, " latency: %.1f us/event\r\n", us_per_event); + fprintf(stderr, "\r\n"); + +cleanup_buffers: + for ( c3_d i = 0; i < bat_d; i++ ) { + _free(evt_y[i]); + } + _free(evt_y); + _free(byt_p); + _free(siz_i); + + u3_lmdb_exit(env_u); + +cleanup: + _test_rm_rf(dir_c); + _free(dir_c); + + fprintf(stderr, " write_speed_batched_benchmark: %s\r\n", ret_i ? 
"ok" : "FAILED"); + return ret_i; +} + +//============================================================================== +// Main +//============================================================================== + +int +main(int argc, char* argv[]) +{ + c3_i ret_i = 1; + + // benchmarks + ret_i &= _bench_write_speed(10000, 128); + ret_i &= _bench_write_speed_batched(10000, 1280, 100); + + fprintf(stderr, "\r\n"); + if ( ret_i ) { + fprintf(stderr, "lmdb_tests: ok\n"); + return 0; + } + else { + fprintf(stderr, "lmdb_tests: failed\n"); + return 1; + } +} From 5eb10c41f77f29bf04beb61307de4e25c142259c Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Mon, 2 Feb 2026 13:19:20 -0500 Subject: [PATCH 29/38] book: replaces deed event numbers with a tracking last event number in the header --- pkg/vere/db/book.c | 109 ++++++++++++++++++++++++++++++++------------- pkg/vere/db/book.h | 12 ++--- 2 files changed, 84 insertions(+), 37 deletions(-) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index 79b4de2e51..51a4a968c3 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -147,7 +147,7 @@ _book_save_meta_file(c3_i met_i, const u3_book_meta* met_u) /* _book_make_head(): initialize and write header for new file. ** -** header is write-once and immutable after creation. +** fir_d and las_d start at 0, updated when first events are saved. */ static c3_o _book_make_head(u3_book* txt_u) @@ -159,6 +159,7 @@ _book_make_head(u3_book* txt_u) txt_u->hed_u.mag_w = BOOK_MAGIC; txt_u->hed_u.ver_w = BOOK_VERSION; txt_u->hed_u.fir_d = 0; + txt_u->hed_u.las_d = 0; // write header ret_zs = pwrite(txt_u->fid_i, &txt_u->hed_u, @@ -221,7 +222,7 @@ static inline c3_w _book_deed_size(c3_d len_d) { return sizeof(u3_book_deed_head) + (len_d - 4) + sizeof(u3_book_deed_tail); - // = 20 + (len_d - 4) + 12 = len_d + 28 + // = 12 + (len_d - 4) + 12 = len_d + 20 } /* _book_calc_crc(): compute CRC32 for reed. 
@@ -229,12 +230,11 @@ _book_deed_size(c3_d len_d) static c3_w _book_calc_crc(const u3_book_reed* red_u) { - c3_y buf_y[20]; // 8 bytes len_d + 8 bytes eve_d + 4 bytes mug + c3_y buf_y[12]; // 8 bytes len_d + 4 bytes mug memcpy(buf_y, &red_u->len_d, 8); - memcpy(buf_y + 8, &red_u->eve_d, 8); - memcpy(buf_y + 16, &red_u->mug_l, 4); + memcpy(buf_y + 8, &red_u->mug_l, 4); - return _book_crc32_two(buf_y, 20, red_u->jam_y, red_u->len_d - 4); + return _book_crc32_two(buf_y, 12, red_u->jam_y, red_u->len_d - 4); } /* _book_okay_reed(): validate reed integrity. @@ -305,7 +305,6 @@ _book_read_deed(c3_i fid_i, c3_d* off_d, u3_book_reed* red_u) // populate reed from head red_u->len_d = hed_u.len_d; - red_u->eve_d = hed_u.eve_d; red_u->mug_l = hed_u.mug_l; // read jam data (len_d - mug bytes) @@ -362,7 +361,6 @@ _book_save_deed(c3_i fid_i, c3_d* off_d, const u3_book_reed* red_u) // prepare deed_head u3_book_deed_head hed_u; hed_u.len_d = red_u->len_d; - hed_u.eve_d = red_u->eve_d; hed_u.mug_l = red_u->mug_l; // prepare deed_tail @@ -414,17 +412,18 @@ _book_skip_deed(c3_i fid_i, c3_d* off_d) return c3y; } -/* _book_scan_back(): fast reverse scan to find last valid deed. +/* _book_scan_back(): fast reverse scan to validate last deed. ** -** this is the fast path for normal startup. scans backwards from -** file end using the trailing let_d field to locate deed boundaries. +** this is the fast path for normal startup. uses header's las_d +** as the authoritative last event number, and validates backward +** from file end using the trailing let_d field. 
** ** on success: ** - sets *off_d to append offset (byte after last valid deed) -** - sets txt_u->las_d to last event number +** - sets txt_u->las_d from header's las_d ** ** returns: -** c3y: found valid deed OR file is empty (no deeds) +** c3y: last deed valid OR file is empty (no deeds) ** c3n: corruption detected (caller should fall back to _book_scan_fore) ** ** NB: does NOT truncate file or perform recovery; just reports state. @@ -447,13 +446,20 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) // empty or header-only file is valid (no deeds yet) if ( end_d <= sizeof(u3_book_head) ) { *off_d = sizeof(u3_book_head); - txt_u->las_d = 0; + txt_u->las_d = txt_u->hed_u.las_d; return c3y; } + // if header says no events, but file has data beyond header, + // that's uncommitted data - fall back to forward scan + if ( 0 == txt_u->hed_u.las_d ) { + *off_d = sizeof(u3_book_head); + return c3n; + } + pos_d = end_d; - // scan backwards + // scan backwards to validate last deed while ( pos_d > sizeof(u3_book_head) ) { c3_zs ret_zs; c3_d let_d; @@ -495,10 +501,10 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) break; } - // deed is valid — use eve_d directly + // deed is valid — use header's las_d as authoritative c3_free(red_u.jam_y); *off_d = pos_d; - txt_u->las_d = red_u.eve_d; + txt_u->las_d = txt_u->hed_u.las_d; return c3y; } } @@ -512,12 +518,13 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) ** ** used as fallback when _book_scan_back fails (corruption recovery). ** validates each record's CRC and len_d == let_d sequentially. -** if corruption is found, truncates file to remove invalid data. +** if corruption is found, truncates file and updates header's las_d. 
** ** on completion: ** - sets *off_d to append offset ** - sets txt_u->las_d to last valid event number ** - truncates file if corrupted trailing data was found +** - updates header's las_d if recovery changed the count ** ** returns: ** c3y: always (recovery is best-effort) @@ -526,19 +533,20 @@ static c3_o _book_scan_fore(u3_book* txt_u, c3_d* off_d) { c3_d cur_d = sizeof(u3_book_head); // start of events - c3_d cot_d = 0; // count - c3_d las_d = 0; // last valid event found - c3_d exp_d; // expected event number + c3_d cot_d = 0; // count of valid deeds found + c3_d las_d = 0; // last valid event number found + c3_d exp_d; // expected event count from header - if ( 0 == txt_u->hed_u.fir_d && 0 == txt_u->las_d ) { + if ( 0 == txt_u->hed_u.fir_d && 0 == txt_u->hed_u.las_d ) { // empty log is valid (no deeds yet) txt_u->las_d = 0; *off_d = cur_d; return c3y; } - exp_d = ( txt_u->las_d >= txt_u->hed_u.fir_d ) - ? txt_u->las_d - txt_u->hed_u.fir_d + 1 + // expected count based on header's las_d + exp_d = ( txt_u->hed_u.las_d >= txt_u->hed_u.fir_d ) + ? 
txt_u->hed_u.las_d - txt_u->hed_u.fir_d + 1 : 0; while ( 1 ) { @@ -558,12 +566,13 @@ _book_scan_fore(u3_book* txt_u, c3_d* off_d) break; } + // deed is valid - calculate its event number las_d = txt_u->hed_u.fir_d + cot_d; c3_free(red_u.jam_y); cot_d++; } - // check if we found fewer events than expected + // check if we found fewer events than header claims if ( cot_d != exp_d ) { u3l_log("book: recovery: found %" PRIu64 " events, expected %" PRIu64 "\r\n", cot_d, exp_d); @@ -571,12 +580,13 @@ _book_scan_fore(u3_book* txt_u, c3_d* off_d) // update las_d based on what we found if ( 0 == cot_d ) { txt_u->las_d = 0; + las_d = 0; cur_d = sizeof(u3_book_head); } else { txt_u->las_d = las_d; } - // truncate file + // truncate file to remove invalid data if ( -1 == ftruncate(txt_u->fid_i, cur_d) ) { u3l_log("book: failed to truncate: %s\r\n", strerror(errno)); @@ -586,6 +596,18 @@ _book_scan_fore(u3_book* txt_u, c3_d* off_d) strerror(errno)); } } + + // update header's las_d to match recovered state + txt_u->hed_u.las_d = las_d; + if ( sizeof(c3_d) != pwrite(txt_u->fid_i, &txt_u->hed_u.las_d, + sizeof(c3_d), offsetof(u3_book_head, las_d)) ) + { + u3l_log("book: failed to update header las_d: %s\r\n", strerror(errno)); + } else { + if ( -1 == c3_sync(txt_u->fid_i) ) { + u3l_log("book: failed to sync header: %s\r\n", strerror(errno)); + } + } } else { txt_u->las_d = las_d; } @@ -677,8 +699,9 @@ u3_book_init(const c3_c* pax_c) if ( epo_d ) { txt_u->hed_u.fir_d = epo_d; + txt_u->hed_u.las_d = epo_d; - // persist fir_d (no need if epo_d is 0) + // persist fir_d and las_d (no need if epo_d is 0) if ( sizeof(c3_d) != pwrite(fid_i, &txt_u->hed_u.fir_d, sizeof(c3_d), offsetof(u3_book_head, fir_d)) ) { @@ -686,8 +709,15 @@ u3_book_init(const c3_c* pax_c) goto fail4; } + if ( sizeof(c3_d) != pwrite(fid_i, &txt_u->hed_u.las_d, + sizeof(c3_d), offsetof(u3_book_head, las_d)) ) + { + u3l_log("book: failed to write las_d: %s\r\n", strerror(errno)); + goto fail4; + } + if ( -1 == 
c3_sync(fid_i) ) { - u3l_log("book: failed to sync fir_d: %s\r\n", strerror(errno)); + u3l_log("book: failed to sync header: %s\r\n", strerror(errno)); goto fail4; } } @@ -943,12 +973,10 @@ u3_book_save(u3_book* txt_u, memcpy(&red_u.mug_l, buf_y, 4); red_u.jam_y = buf_y + 4; red_u.len_d = siz_d; - red_u.eve_d = eve_d + i_w; red_u.crc_w = _book_calc_crc(&red_u); // populate deed_head hed_u[i_w].len_d = siz_d; - hed_u[i_w].eve_d = eve_d + i_w; hed_u[i_w].mug_l = red_u.mug_l; // populate deed_tail @@ -1003,15 +1031,32 @@ u3_book_save(u3_book* txt_u, c3_free(tal_u); c3_free(iov_u); - // sync data to disk + // sync deed data to disk if ( -1 == c3_sync(txt_u->fid_i) ) { fprintf(stderr, "book: failed to sync events: %s\r\n", strerror(errno)); return c3n; } + // update header's las_d to signal successful commit + c3_d new_las_d = eve_d + len_d - 1; + txt_u->hed_u.las_d = new_las_d; + + if ( sizeof(c3_d) != pwrite(txt_u->fid_i, &txt_u->hed_u.las_d, + sizeof(c3_d), offsetof(u3_book_head, las_d)) ) + { + fprintf(stderr, "book: failed to write las_d: %s\r\n", strerror(errno)); + return c3n; + } + + // sync header to finalize commit + if ( -1 == c3_sync(txt_u->fid_i) ) { + fprintf(stderr, "book: failed to sync las_d: %s\r\n", strerror(errno)); + return c3n; + } + // update cache - txt_u->las_d = eve_d + len_d - 1; + txt_u->las_d = new_las_d; txt_u->off_d = now_d; return c3y; diff --git a/pkg/vere/db/book.h b/pkg/vere/db/book.h index 443ec81ef1..e7e98e9ada 100644 --- a/pkg/vere/db/book.h +++ b/pkg/vere/db/book.h @@ -7,12 +7,16 @@ /* book: append-only event log */ - /* u3_book_head: on-disk file header (16 bytes, immutable) + /* u3_book_head: on-disk file header (24 bytes) + ** + ** fir_d is write-once (set on first event save). + ** las_d is updated after each batch of events is committed. 
*/ typedef struct _u3_book_head { c3_w mag_w; // magic number: 0x424f4f4b ("BOOK") - c3_w ver_w; // format version: 1 + c3_w ver_w; // format version: 2 c3_d fir_d; // first event number in file + c3_d las_d; // last event number (commit marker) } u3_book_head; /* u3_book_meta: on-disk metadata format (fixed 256 bytes) @@ -55,11 +59,10 @@ c3_o liv_o; // iterator valid } u3_book_walk; - /* u3_book_deed_head: on-disk deed header + /* u3_book_deed_head: on-disk deed header (12 bytes) */ typedef struct _u3_book_deed_head { c3_d len_d; // payload size (mug + jam) - c3_d eve_d; // event number c3_l mug_l; // mug/hash } u3_book_deed_head; @@ -85,7 +88,6 @@ */ typedef struct _u3_book_reed { c3_d len_d; // total payload size - c3_d eve_d; // event number c3_l mug_l; // mug/hash c3_y* jam_y; // jam data (caller owns, len = len_d - 4) c3_w crc_w; // CRC32 checksum From b636f7e752882236753618602cb2113fc87d86da Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Mon, 2 Feb 2026 13:51:47 -0500 Subject: [PATCH 30/38] book: removes per-event checksums --- pkg/vere/book_tests.c | 7 +++---- pkg/vere/db/book.c | 46 +++++-------------------------------------- pkg/vere/db/book.h | 2 -- 3 files changed, 8 insertions(+), 47 deletions(-) diff --git a/pkg/vere/book_tests.c b/pkg/vere/book_tests.c index c9747167df..37a3a6454b 100644 --- a/pkg/vere/book_tests.c +++ b/pkg/vere/book_tests.c @@ -555,7 +555,7 @@ _test_truncated_file_recovery(void) } // write two events (each evt_z = 12 bytes: 4 mug + 8 jam) - // deed size on disk = 12 (head) + 8 (jam) + 12 (tail) = 32 bytes + // deed size on disk = 12 (head) + 8 (jam) + 8 (tail) = 28 bytes evt1_y = _test_make_event(&evt_z, 1); evt2_y = _test_make_event(&evt_z, 2); { @@ -573,9 +573,8 @@ _test_truncated_file_recovery(void) u3_book_exit(txt_u); txt_u = 0; - // file layout: [header 16] [deed1] [deed2] + // file layout: [header] [deed1] [deed2] // deed size = sizeof(deed_head) + (len_d - 4) + sizeof(deed_tail) - // with struct padding, this is 
typically 40 bytes per deed for our 12-byte events snprintf(path_c, sizeof(path_c), "%s/book.log", dir_c); c3_d siz_d = _test_file_size(path_c); @@ -1479,7 +1478,7 @@ main(int argc, char* argv[]) // benchmarks ret_i &= _bench_write_speed(10000, 128); - ret_i &= _bench_write_speed_batched(10000, 1280, 100); + ret_i &= _bench_write_speed_batched(1000000, 1280, 1000); fprintf(stderr, "\r\n"); if ( ret_i ) { diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index 51a4a968c3..6f29109706 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -9,7 +9,6 @@ #include #include #include -#include #include "c3/c3.h" #include "noun.h" @@ -21,8 +20,8 @@ // optimized for sequential writes and reads; no random access. // // file format: -// [16-byte header (immutable)] -// [events: len_d | mug_l | jam_data | crc_m | let_d] +// [24-byte header] +// [events: len_d | mug_l | jam_data | let_d] // // metadata stored in separate meta.bin file // @@ -32,15 +31,6 @@ #define BOOK_MAGIC 0x424f4f4b // "BOOK" #define BOOK_VERSION 1 // format version -/* _book_crc32_two(): compute CRC32 over two buffers. -*/ -static c3_w -_book_crc32_two(c3_y* one_y, c3_w one_w, c3_y* two_y, c3_w two_w) -{ - c3_w crc_w = (c3_w)crc32(0L, one_y, one_w); - return (c3_w)crc32(crc_w, two_y, two_w); -} - /* _book_meta_path(): construct path to meta.bin from book directory path. ** ** pax_c should be a directory path (the one passed to u3_book_init) @@ -225,18 +215,6 @@ _book_deed_size(c3_d len_d) // = 12 + (len_d - 4) + 12 = len_d + 20 } -/* _book_calc_crc(): compute CRC32 for reed. -*/ -static c3_w -_book_calc_crc(const u3_book_reed* red_u) -{ - c3_y buf_y[12]; // 8 bytes len_d + 4 bytes mug - memcpy(buf_y, &red_u->len_d, 8); - memcpy(buf_y + 8, &red_u->mug_l, 4); - - return _book_crc32_two(buf_y, 12, red_u->jam_y, red_u->len_d - 4); -} - /* _book_okay_reed(): validate reed integrity. 
*/ static c3_o @@ -247,12 +225,6 @@ _book_okay_reed(const u3_book_reed* red_u) return c3n; } - // validate CRC - c3_w crc_w = _book_calc_crc(red_u); - if ( crc_w != red_u->crc_w ) { - return c3n; - } - return c3y; } @@ -330,7 +302,6 @@ _book_read_deed(c3_i fid_i, c3_d* off_d, u3_book_reed* red_u) now_d += sizeof(u3_book_deed_tail); // populate reed from tail - red_u->crc_w = tal_u.crc_w; let_d = tal_u.let_d; // validate len_d == let_d @@ -365,7 +336,6 @@ _book_save_deed(c3_i fid_i, c3_d* off_d, const u3_book_reed* red_u) // prepare deed_tail u3_book_deed_tail tal_u; - tal_u.crc_w = red_u->crc_w; tal_u.let_d = red_u->len_d; // build iovec for scatter-gather write: head + jam + tail @@ -968,19 +938,13 @@ u3_book_save(u3_book* txt_u, return c3n; } - // build reed for CRC calculation - u3_book_reed red_u; - memcpy(&red_u.mug_l, buf_y, 4); - red_u.jam_y = buf_y + 4; - red_u.len_d = siz_d; - red_u.crc_w = _book_calc_crc(&red_u); - // populate deed_head + c3_l mug_l; + memcpy(&mug_l, buf_y, 4); hed_u[i_w].len_d = siz_d; - hed_u[i_w].mug_l = red_u.mug_l; + hed_u[i_w].mug_l = mug_l; // populate deed_tail - tal_u[i_w].crc_w = red_u.crc_w; tal_u[i_w].let_d = siz_d; } diff --git a/pkg/vere/db/book.h b/pkg/vere/db/book.h index e7e98e9ada..d372920eeb 100644 --- a/pkg/vere/db/book.h +++ b/pkg/vere/db/book.h @@ -69,7 +69,6 @@ /* u3_book_deed_tail: on-disk deed trailer */ typedef struct _u3_book_deed_tail { - c3_w crc_w; // CRC32 checksum c3_d let_d; // length trailer (validates len_d) } u3_book_deed_tail; @@ -90,7 +89,6 @@ c3_d len_d; // total payload size c3_l mug_l; // mug/hash c3_y* jam_y; // jam data (caller owns, len = len_d - 4) - c3_w crc_w; // CRC32 checksum } u3_book_reed; /* u3_book_init(): open/create event log at [pax_c]. 
From 02bcb3bf8eb6c6a3fc3e5f3c3b73bbd517ffdeb9 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Mon, 2 Feb 2026 15:01:40 -0500 Subject: [PATCH 31/38] book: implements lmdb-style double-buffering --- pkg/vere/book_tests.c | 42 ++-- pkg/vere/db/book.c | 509 ++++++++++++++++++++++++++---------------- pkg/vere/db/book.h | 54 ++--- pkg/vere/lmdb_tests.c | 8 +- 4 files changed, 381 insertions(+), 232 deletions(-) diff --git a/pkg/vere/book_tests.c b/pkg/vere/book_tests.c index 37a3a6454b..ab93076456 100644 --- a/pkg/vere/book_tests.c +++ b/pkg/vere/book_tests.c @@ -12,6 +12,9 @@ #define _alloc(sz) malloc(sz) #define _free(ptr) free(ptr) +// book format v2: header area size (two 4096-byte header slots) +#define BOOK_DEED_BASE 8192 + /* _test_make_tmpdir(): create unique temporary directory with epoch subdir. ** ** creates /tmp/book_test_XXXXXX/0i0 and returns the epoch path. @@ -272,12 +275,13 @@ _test_single_event_lifecycle(void) } // verify gulf + // NB: fir_d is the epoch base (0), las_d is the last stored event (1) { c3_d low_d, hig_d; u3_book_gulf(txt_u, &low_d, &hig_d); - if ( 1 != low_d || 1 != hig_d ) { - fprintf(stderr, " single_event: gulf expected (1,1), got (%" PRIu64 ",%" PRIu64 ")\r\n", + if ( 0 != low_d || 1 != hig_d ) { + fprintf(stderr, " single_event: gulf expected (0,1), got (%" PRIu64 ",%" PRIu64 ")\r\n", low_d, hig_d); ret_i = 0; } @@ -573,16 +577,16 @@ _test_truncated_file_recovery(void) u3_book_exit(txt_u); txt_u = 0; - // file layout: [header] [deed1] [deed2] + // file layout: [header A @ 0] [header B @ 4096] [deed1 @ 8192] [deed2] // deed size = sizeof(deed_head) + (len_d - 4) + sizeof(deed_tail) snprintf(path_c, sizeof(path_c), "%s/book.log", dir_c); c3_d siz_d = _test_file_size(path_c); - // calculate deed size dynamically: total - header = 2 deeds - c3_d deed_size = (siz_d - 16) / 2; + // calculate deed size dynamically: total - headers = 2 deeds + c3_d deed_size = (siz_d - BOOK_DEED_BASE) / 2; - // truncate to: header + deed1 + 5 bytes of 
deed2 - c3_d truncate_at = 16 + deed_size + 5; + // truncate to: headers + deed1 + 5 bytes of deed2 + c3_d truncate_at = BOOK_DEED_BASE + deed_size + 5; if ( c3n == _test_truncate_file(path_c, truncate_at) ) { fprintf(stderr, " truncated_file: truncate failed\r\n"); @@ -870,11 +874,16 @@ _test_invalid_magic(void) u3_book_exit(txt_u); txt_u = 0; - // corrupt magic number at offset 0 + // corrupt magic number in BOTH header slots (double-buffered) snprintf(path_c, sizeof(path_c), "%s/book.log", dir_c); c3_w bad_magic = 0xDEADBEEF; if ( c3n == _test_write_raw(path_c, 0, &bad_magic, sizeof(bad_magic)) ) { - fprintf(stderr, " invalid_magic: write_raw failed\r\n"); + fprintf(stderr, " invalid_magic: write_raw A failed\r\n"); + ret_i = 0; + goto cleanup; + } + if ( c3n == _test_write_raw(path_c, 4096, &bad_magic, sizeof(bad_magic)) ) { + fprintf(stderr, " invalid_magic: write_raw B failed\r\n"); ret_i = 0; goto cleanup; } @@ -918,11 +927,16 @@ _test_invalid_version(void) u3_book_exit(txt_u); txt_u = 0; - // corrupt version at offset 4 + // corrupt version in BOTH header slots (double-buffered) snprintf(path_c, sizeof(path_c), "%s/book.log", dir_c); c3_w bad_version = 99; if ( c3n == _test_write_raw(path_c, 4, &bad_version, sizeof(bad_version)) ) { - fprintf(stderr, " invalid_version: write_raw failed\r\n"); + fprintf(stderr, " invalid_version: write_raw A failed\r\n"); + ret_i = 0; + goto cleanup; + } + if ( c3n == _test_write_raw(path_c, 4096 + 4, &bad_version, sizeof(bad_version)) ) { + fprintf(stderr, " invalid_version: write_raw B failed\r\n"); ret_i = 0; goto cleanup; } @@ -966,7 +980,7 @@ _test_undersized_file(void) u3_book_exit(txt_u); txt_u = 0; - // truncate to 8 bytes (less than 16-byte header) + // truncate to 8 bytes (less than 8192-byte header area) snprintf(path_c, sizeof(path_c), "%s/book.log", dir_c); if ( c3n == _test_truncate_file(path_c, 8) ) { fprintf(stderr, " undersized: truncate failed\r\n"); @@ -1477,8 +1491,8 @@ main(int argc, char* argv[]) ret_i 
&= _test_metadata_size_validation(); // benchmarks - ret_i &= _bench_write_speed(10000, 128); - ret_i &= _bench_write_speed_batched(1000000, 1280, 1000); + ret_i &= _bench_write_speed(1000, 128); + ret_i &= _bench_write_speed_batched(100000, 1280, 1000); fprintf(stderr, "\r\n"); if ( ret_i ) { diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index 6f29109706..f2ce5d17ba 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -9,19 +9,20 @@ #include #include #include +#include #include "c3/c3.h" #include "noun.h" #include "ship.h" -// book: append-only event log +// book: mostly append-only event log // // simple file-based persistence layer for urbit's event log. // optimized for sequential writes and reads; no random access. // // file format: // [24-byte header] -// [events: len_d | mug_l | jam_data | let_d] +// [events: len_d | buffer_data | let_d] // // metadata stored in separate meta.bin file // @@ -31,6 +32,46 @@ #define BOOK_MAGIC 0x424f4f4b // "BOOK" #define BOOK_VERSION 1 // format version + // header slot offsets (page-aligned for atomic writes) + #define BOOK_HEAD_A 0 // first header slot + #define BOOK_HEAD_B 4096 // second header slot + #define BOOK_DEED_BASE 8192 // deeds start here + +/* _book_head_crc(): compute header CRC32. +** +** computes CRC32 over all fields except crc_w. +*/ +static c3_l +_book_head_crc(const u3_book_head* hed_u) +{ + // checksum covers: mag_w, ver_w, fir_d, las_d, seq_d (28 bytes) + c3_z len_z = offsetof(u3_book_head, crc_w); + return (c3_l)crc32(0, (const c3_y*)hed_u, len_z); +} + +/* _book_head_okay(): validate header CRC and magic. 
+** +** returns: c3y if header is valid, c3n otherwise +*/ +static c3_o +_book_head_okay(const u3_book_head* hed_u) +{ + if ( BOOK_MAGIC != hed_u->mag_w ) { + return c3n; + } + + if ( BOOK_VERSION != hed_u->ver_w ) { + return c3n; + } + + c3_w crc_w = _book_head_crc(hed_u); + if ( crc_w != hed_u->crc_w ) { + return c3n; + } + + return c3y; +} + /* _book_meta_path(): construct path to meta.bin from book directory path. ** ** pax_c should be a directory path (the one passed to u3_book_init) @@ -135,9 +176,10 @@ _book_save_meta_file(c3_i met_i, const u3_book_meta* met_u) return c3y; } -/* _book_make_head(): initialize and write header for new file. +/* _book_make_head(): initialize and write both header slots for new file. ** ** fir_d and las_d start at 0, updated when first events are saved. +** both header slots are initialized identically with seq_d = 0. */ static c3_o _book_make_head(u3_book* txt_u) @@ -150,27 +192,49 @@ _book_make_head(u3_book* txt_u) txt_u->hed_u.ver_w = BOOK_VERSION; txt_u->hed_u.fir_d = 0; txt_u->hed_u.las_d = 0; + txt_u->hed_u.seq_d = 0; + txt_u->hed_u.crc_w = _book_head_crc(&txt_u->hed_u); - // write header + // write header slot A ret_zs = pwrite(txt_u->fid_i, &txt_u->hed_u, - sizeof(u3_book_head), 0); + sizeof(u3_book_head), BOOK_HEAD_A); if ( ret_zs != sizeof(u3_book_head) ) { - u3l_log("book: failed to write header: %s\r\n", + u3l_log("book: failed to write header A: %s\r\n", + strerror(errno)); + return c3n; + } + + // write header slot B (identical initially) + ret_zs = pwrite(txt_u->fid_i, &txt_u->hed_u, + sizeof(u3_book_head), BOOK_HEAD_B); + + if ( ret_zs != sizeof(u3_book_head) ) { + u3l_log("book: failed to write header B: %s\r\n", + strerror(errno)); + return c3n; + } + + // extend file to BOOK_DEED_BASE so it passes minimum size check on reopen + if ( -1 == ftruncate(txt_u->fid_i, BOOK_DEED_BASE) ) { + u3l_log("book: failed to extend file: %s\r\n", strerror(errno)); return c3n; } if ( -1 == c3_sync(txt_u->fid_i) ) { - 
u3l_log("book: failed to sync header: %s\r\n", + u3l_log("book: failed to sync headers: %s\r\n", strerror(errno)); return c3n; } + // start with slot A as active + txt_u->act_w = 0; + return c3y; } -/* _book_okay_head(): validate header fields. +/* _book_okay_head(): validate header fields (verbose version). ** ** returns: c3y if valid, c3n otherwise (prints error message) */ @@ -187,23 +251,78 @@ _book_okay_head(const u3_book_head* hed_u) return c3n; } + c3_w crc_w = _book_head_crc(hed_u); + if ( crc_w != hed_u->crc_w ) { + fprintf(stderr, "book: header checksum mismatch: 0x%08x != 0x%08x\r\n", + crc_w, hed_u->crc_w); + return c3n; + } + return c3y; } -/* _book_read_head(): read and validate header. +/* _book_read_head(): read both header slots and select valid one. +** +** reads both header slots, validates checksums, and selects the one +** with the higher sequence number. this implements the LMDB-style +** double-buffered commit protocol. +** +** on success, txt_u->hed_u contains the valid header and txt_u->act_w +** is set to the active slot index (0 or 1). 
*/ static c3_o _book_read_head(u3_book* txt_u) { - c3_zs ret_zs = pread(txt_u->fid_i, &txt_u->hed_u, - sizeof(u3_book_head), 0); + u3_book_head hed_a, hed_b; + c3_o val_a, val_b; + c3_zs ret_zs; + + // read header slot A + ret_zs = pread(txt_u->fid_i, &hed_a, sizeof(u3_book_head), BOOK_HEAD_A); + if ( ret_zs != sizeof(u3_book_head) ) { + fprintf(stderr, "book: failed to read header A\r\n"); + val_a = c3n; + } + else { + val_a = _book_head_okay(&hed_a); + } + // read header slot B + ret_zs = pread(txt_u->fid_i, &hed_b, sizeof(u3_book_head), BOOK_HEAD_B); if ( ret_zs != sizeof(u3_book_head) ) { - fprintf(stderr, "book: failed to read header\r\n"); + fprintf(stderr, "book: failed to read header B\r\n"); + val_b = c3n; + } + else { + val_b = _book_head_okay(&hed_b); + } + + // select valid header with highest sequence number + if ( c3y == val_a && c3y == val_b ) { + // both valid: use higher sequence number + if ( hed_a.seq_d >= hed_b.seq_d ) { + txt_u->hed_u = hed_a; + txt_u->act_w = 0; + } + else { + txt_u->hed_u = hed_b; + txt_u->act_w = 1; + } + } + else if ( c3y == val_a ) { + txt_u->hed_u = hed_a; + txt_u->act_w = 0; + } + else if ( c3y == val_b ) { + txt_u->hed_u = hed_b; + txt_u->act_w = 1; + } + else { + fprintf(stderr, "book: no valid header found\r\n"); return c3n; } - return _book_okay_head(&txt_u->hed_u); + return c3y; } /* _book_deed_size(): calculate total on-disk size of deed. @@ -211,8 +330,9 @@ _book_read_head(u3_book* txt_u) static inline c3_w _book_deed_size(c3_d len_d) { - return sizeof(u3_book_deed_head) + (len_d - 4) + sizeof(u3_book_deed_tail); - // = 12 + (len_d - 4) + 12 = len_d + 20 + // format: len_d (8) + buffer_data (len_d) + let_d (8) + // = 8 + len_d + 8 = len_d + 16 + return sizeof(c3_d) + len_d + sizeof(c3_d); } /* _book_okay_reed(): validate reed integrity. @@ -228,10 +348,10 @@ _book_okay_reed(const u3_book_reed* red_u) return c3y; } -/* _book_reed_to_buff(): convert reed to mug+jam buffer format. 
+/* _book_reed_to_buff(): convert reed to byte buffer. ** ** allocates output buffer; caller must free. -** frees red_u->jam_y on success; caller must free on failure. +** frees red_u->buf_y on success; caller must free on failure. ** ** returns: allocated buffer, or 0 on allocation failure */ @@ -245,9 +365,8 @@ _book_reed_to_buff(u3_book_reed* red_u, c3_z* len_z) return 0; } - memcpy(buf_y, &red_u->mug_l, 4); - memcpy(buf_y + 4, red_u->jam_y, red_u->len_d - 4); - c3_free(red_u->jam_y); + memcpy(buf_y, red_u->buf_y, red_u->len_d); + c3_free(red_u->buf_y); return buf_y; } @@ -255,10 +374,10 @@ _book_reed_to_buff(u3_book_reed* red_u, c3_z* len_z) /* _book_read_deed(): read deed from file into [red_u]. ** ** returns: -** c3y: success, jam_y allocated +** c3y: success, buf_y allocated with complete buffer ** c3n: failure (EOF or corruption) ** -** on success, caller must free red_u->jam_y +** on success, caller must free red_u->buf_y */ static c3_o _book_read_deed(c3_i fid_i, c3_d* off_d, u3_book_reed* red_u) @@ -267,49 +386,47 @@ _book_read_deed(c3_i fid_i, c3_d* off_d, u3_book_reed* red_u) c3_d now_d = *off_d; c3_d let_d; - // read deed_head - u3_book_deed_head hed_u; - ret_zs = pread(fid_i, &hed_u, sizeof(u3_book_deed_head), now_d); - if ( ret_zs != sizeof(u3_book_deed_head) ) { + // read deed head (len_d) + c3_d len_d; + ret_zs = pread(fid_i, &len_d, sizeof(c3_d), now_d); + if ( ret_zs != sizeof(c3_d) ) { return c3n; } - now_d += sizeof(u3_book_deed_head); + now_d += sizeof(c3_d); - // populate reed from head - red_u->len_d = hed_u.len_d; - red_u->mug_l = hed_u.mug_l; - - // read jam data (len_d - mug bytes) - c3_d jaz_d = red_u->len_d - 4; - red_u->jam_y = c3_malloc(jaz_d); - if ( !red_u->jam_y ) { + // read complete buffer data + red_u->buf_y = c3_malloc(len_d); + if ( !red_u->buf_y ) { return c3n; } - ret_zs = pread(fid_i, red_u->jam_y, jaz_d, now_d); - if ( ret_zs != (c3_zs)jaz_d ) { - c3_free(red_u->jam_y); + ret_zs = pread(fid_i, red_u->buf_y, len_d, now_d); 
+ if ( ret_zs != (c3_zs)len_d ) { + c3_free(red_u->buf_y); return c3n; } - now_d += jaz_d; + now_d += len_d; - // read deed_tail - u3_book_deed_tail tal_u; - ret_zs = pread(fid_i, &tal_u, sizeof(u3_book_deed_tail), now_d); - if ( ret_zs != sizeof(u3_book_deed_tail) ) { - c3_free(red_u->jam_y); + // read deed tail (let_d validation field) + c3_d let_d_read; + ret_zs = pread(fid_i, &let_d_read, sizeof(c3_d), now_d); + if ( ret_zs != sizeof(c3_d) ) { + c3_free(red_u->buf_y); return c3n; } - now_d += sizeof(u3_book_deed_tail); + now_d += sizeof(c3_d); - // populate reed from tail - let_d = tal_u.let_d; + // validate + let_d = let_d_read; // validate len_d == let_d - if ( red_u->len_d != let_d ) { - c3_free(red_u->jam_y); + if ( len_d != let_d ) { + c3_free(red_u->buf_y); return c3n; } + // populate reed + red_u->len_d = len_d; + // update offset *off_d = now_d; @@ -327,27 +444,21 @@ _book_read_deed(c3_i fid_i, c3_d* off_d, u3_book_reed* red_u) static c3_o _book_save_deed(c3_i fid_i, c3_d* off_d, const u3_book_reed* red_u) { - c3_d jaz_d = red_u->len_d - 4; // len_d - mug bytes + c3_d len_d = red_u->len_d; // complete buffer size - // prepare deed_head - u3_book_deed_head hed_u; - hed_u.len_d = red_u->len_d; - hed_u.mug_l = red_u->mug_l; + // prepare deed tail (validation field) + c3_d let_d = len_d; - // prepare deed_tail - u3_book_deed_tail tal_u; - tal_u.let_d = red_u->len_d; - - // build iovec for scatter-gather write: head + jam + tail + // build iovec for scatter-gather write: len_d + buffer + let_d struct iovec iov_u[3]; - iov_u[0].iov_base = &hed_u; - iov_u[0].iov_len = sizeof(u3_book_deed_head); - iov_u[1].iov_base = red_u->jam_y; - iov_u[1].iov_len = jaz_d; - iov_u[2].iov_base = &tal_u; - iov_u[2].iov_len = sizeof(u3_book_deed_tail); - - c3_z tot_z = sizeof(u3_book_deed_head) + jaz_d + sizeof(u3_book_deed_tail); + iov_u[0].iov_base = &len_d; + iov_u[0].iov_len = sizeof(c3_d); + iov_u[1].iov_base = red_u->buf_y; + iov_u[1].iov_len = len_d; + iov_u[2].iov_base = 
&let_d; + iov_u[2].iov_len = sizeof(c3_d); + + c3_z tot_z = sizeof(c3_d) + len_d + sizeof(c3_d); c3_zs ret_zs = pwritev(fid_i, iov_u, 3, *off_d); if ( ret_zs != (c3_zs)tot_z ) { @@ -407,15 +518,15 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) // get file size if ( -1 == fstat(txt_u->fid_i, &buf_u) ) { - *off_d = sizeof(u3_book_head); + *off_d = BOOK_DEED_BASE; return c3n; } end_d = (c3_d)buf_u.st_size; // empty or header-only file is valid (no deeds yet) - if ( end_d <= sizeof(u3_book_head) ) { - *off_d = sizeof(u3_book_head); + if ( end_d <= BOOK_DEED_BASE ) { + *off_d = BOOK_DEED_BASE; txt_u->las_d = txt_u->hed_u.las_d; return c3y; } @@ -423,21 +534,22 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) // if header says no events, but file has data beyond header, // that's uncommitted data - fall back to forward scan if ( 0 == txt_u->hed_u.las_d ) { - *off_d = sizeof(u3_book_head); + *off_d = BOOK_DEED_BASE; return c3n; } pos_d = end_d; // scan backwards to validate last deed - while ( pos_d > sizeof(u3_book_head) ) { + while ( pos_d > BOOK_DEED_BASE ) { c3_zs ret_zs; c3_d let_d; c3_d siz_d; c3_d ded_d; // deed start offset + c3_d min_size = sizeof(u3_book_deed) + sizeof(c3_d); // minimum deed size // need at least deed_tail size to read let_d - if ( pos_d < sizeof(u3_book_head) + sizeof(u3_book_deed_tail) ) { + if ( pos_d < BOOK_DEED_BASE + min_size ) { break; } @@ -450,7 +562,7 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) // calculate deed size and start position siz_d = _book_deed_size(let_d); - if ( siz_d > pos_d - sizeof(u3_book_head) ) { + if ( siz_d > pos_d - BOOK_DEED_BASE ) { // deed would extend before header break; } @@ -467,12 +579,12 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) } if ( c3n == _book_okay_reed(&red_u) ) { - c3_free(red_u.jam_y); + c3_free(red_u.buf_y); break; } // deed is valid — use header's las_d as authoritative - c3_free(red_u.jam_y); + c3_free(red_u.buf_y); *off_d = pos_d; txt_u->las_d = txt_u->hed_u.las_d; return c3y; @@ 
-480,7 +592,7 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) } // no valid deeds found - *off_d = sizeof(u3_book_head); + *off_d = BOOK_DEED_BASE; return c3n; } @@ -488,13 +600,13 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) ** ** used as fallback when _book_scan_back fails (corruption recovery). ** validates each record's CRC and len_d == let_d sequentially. -** if corruption is found, truncates file and updates header's las_d. +** if corruption is found, truncates file and updates header. ** ** on completion: ** - sets *off_d to append offset ** - sets txt_u->las_d to last valid event number ** - truncates file if corrupted trailing data was found -** - updates header's las_d if recovery changed the count +** - updates header if recovery changed the count ** ** returns: ** c3y: always (recovery is best-effort) @@ -502,7 +614,7 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) static c3_o _book_scan_fore(u3_book* txt_u, c3_d* off_d) { - c3_d cur_d = sizeof(u3_book_head); // start of events + c3_d cur_d = BOOK_DEED_BASE; // start of events c3_d cot_d = 0; // count of valid deeds found c3_d las_d = 0; // last valid event number found c3_d exp_d; // expected event count from header @@ -515,8 +627,9 @@ _book_scan_fore(u3_book* txt_u, c3_d* off_d) } // expected count based on header's las_d - exp_d = ( txt_u->hed_u.las_d >= txt_u->hed_u.fir_d ) - ? txt_u->hed_u.las_d - txt_u->hed_u.fir_d + 1 + // NB: fir_d is the epoch base; events are fir_d+1 through las_d + exp_d = ( txt_u->hed_u.las_d > txt_u->hed_u.fir_d ) + ? 
txt_u->hed_u.las_d - txt_u->hed_u.fir_d : 0; while ( 1 ) { @@ -532,13 +645,14 @@ _book_scan_fore(u3_book* txt_u, c3_d* off_d) // validate reed (CRC and length checks) if ( c3n == _book_okay_reed(&red_u) ) { u3l_log("book: validation failed at offset %" PRIu64 "\r\n", beg_d); - c3_free(red_u.jam_y); + c3_free(red_u.buf_y); break; } // deed is valid - calculate its event number - las_d = txt_u->hed_u.fir_d + cot_d; - c3_free(red_u.jam_y); + // NB: first deed is event fir_d + 1 + las_d = txt_u->hed_u.fir_d + 1 + cot_d; + c3_free(red_u.buf_y); cot_d++; } @@ -551,7 +665,7 @@ _book_scan_fore(u3_book* txt_u, c3_d* off_d) if ( 0 == cot_d ) { txt_u->las_d = 0; las_d = 0; - cur_d = sizeof(u3_book_head); + cur_d = BOOK_DEED_BASE; } else { txt_u->las_d = las_d; } @@ -567,16 +681,21 @@ _book_scan_fore(u3_book* txt_u, c3_d* off_d) } } - // update header's las_d to match recovered state + // update header to match recovered state (write to inactive slot) txt_u->hed_u.las_d = las_d; - if ( sizeof(c3_d) != pwrite(txt_u->fid_i, &txt_u->hed_u.las_d, - sizeof(c3_d), offsetof(u3_book_head, las_d)) ) + txt_u->hed_u.seq_d++; + txt_u->hed_u.crc_w = _book_head_crc(&txt_u->hed_u); + + c3_d slot_d = (txt_u->act_w == 0) ? BOOK_HEAD_B : BOOK_HEAD_A; + if ( sizeof(u3_book_head) != pwrite(txt_u->fid_i, &txt_u->hed_u, + sizeof(u3_book_head), slot_d) ) { - u3l_log("book: failed to update header las_d: %s\r\n", strerror(errno)); + u3l_log("book: failed to update header: %s\r\n", strerror(errno)); } else { if ( -1 == c3_sync(txt_u->fid_i) ) { u3l_log("book: failed to sync header: %s\r\n", strerror(errno)); } + txt_u->act_w = (txt_u->act_w == 0) ? 
1 : 0; } } else { txt_u->las_d = las_d; @@ -659,7 +778,9 @@ u3_book_init(const c3_c* pax_c) if ( buf_u.st_size == 0 ) { // new file: initialize and write header - _book_make_head(txt_u); + if ( c3n == _book_make_head(txt_u) ) { + goto fail4; + } // extract epoch number from path c3_d epo_d; @@ -668,21 +789,24 @@ u3_book_init(const c3_c* pax_c) } if ( epo_d ) { + // update header with epoch info and rewrite both slots txt_u->hed_u.fir_d = epo_d; txt_u->hed_u.las_d = epo_d; + txt_u->hed_u.crc_w = _book_head_crc(&txt_u->hed_u); - // persist fir_d and las_d (no need if epo_d is 0) - if ( sizeof(c3_d) != pwrite(fid_i, &txt_u->hed_u.fir_d, - sizeof(c3_d), offsetof(u3_book_head, fir_d)) ) + // write header slot A + if ( sizeof(u3_book_head) != pwrite(fid_i, &txt_u->hed_u, + sizeof(u3_book_head), BOOK_HEAD_A) ) { - u3l_log("book: failed to write fir_d: %s\r\n", strerror(errno)); + u3l_log("book: failed to write header A: %s\r\n", strerror(errno)); goto fail4; } - if ( sizeof(c3_d) != pwrite(fid_i, &txt_u->hed_u.las_d, - sizeof(c3_d), offsetof(u3_book_head, las_d)) ) + // write header slot B + if ( sizeof(u3_book_head) != pwrite(fid_i, &txt_u->hed_u, + sizeof(u3_book_head), BOOK_HEAD_B) ) { - u3l_log("book: failed to write las_d: %s\r\n", strerror(errno)); + u3l_log("book: failed to write header B: %s\r\n", strerror(errno)); goto fail4; } @@ -693,10 +817,10 @@ u3_book_init(const c3_c* pax_c) } txt_u->las_d = epo_d; - txt_u->off_d = sizeof(u3_book_head); + txt_u->off_d = BOOK_DEED_BASE; } - else if ( buf_u.st_size < (off_t)sizeof(u3_book_head) ) { - // corrupt file: too small + else if ( buf_u.st_size < (off_t)BOOK_DEED_BASE ) { + // corrupt file: too small for headers u3l_log("book: file too small: %lld bytes\r\n", (long long)buf_u.st_size); goto fail4; } @@ -774,7 +898,8 @@ void u3_book_stat(const c3_c* log_c) { c3_i fid_i; - u3_book_head hed_u; + u3_book_head hed_a, hed_b, hed_u; + c3_o val_a, val_b; struct stat buf_u; // open the file directly @@ -784,14 +909,25 @@ 
u3_book_stat(const c3_c* log_c) return; } - // read and validate header - if ( sizeof(u3_book_head) != read(fid_i, &hed_u, sizeof(u3_book_head)) ) { - fprintf(stderr, "book: failed to read header\r\n"); - close(fid_i); - return; - } + // read both header slots and pick valid one + c3_zs ret_zs; + ret_zs = pread(fid_i, &hed_a, sizeof(u3_book_head), BOOK_HEAD_A); + val_a = (ret_zs == sizeof(u3_book_head)) ? _book_head_okay(&hed_a) : c3n; - if ( c3n == _book_okay_head(&hed_u) ) { + ret_zs = pread(fid_i, &hed_b, sizeof(u3_book_head), BOOK_HEAD_B); + val_b = (ret_zs == sizeof(u3_book_head)) ? _book_head_okay(&hed_b) : c3n; + + if ( c3y == val_a && c3y == val_b ) { + hed_u = (hed_a.seq_d >= hed_b.seq_d) ? hed_a : hed_b; + } + else if ( c3y == val_a ) { + hed_u = hed_a; + } + else if ( c3y == val_b ) { + hed_u = hed_b; + } + else { + fprintf(stderr, "book: no valid header found\r\n"); close(fid_i); return; } @@ -806,6 +942,8 @@ u3_book_stat(const c3_c* log_c) fprintf(stderr, " file: %s\r\n", log_c); fprintf(stderr, " format: %u\r\n", hed_u.ver_w); fprintf(stderr, " first event: %" PRIu64 "\r\n", hed_u.fir_d); + fprintf(stderr, " last event: %" PRIu64 "\r\n", hed_u.las_d); + fprintf(stderr, " sequence: %" PRIu64 "\r\n", hed_u.seq_d); fprintf(stderr, " file size: %lld bytes\r\n", (long long)buf_u.st_size); // read metadata from meta.bin @@ -845,8 +983,13 @@ u3_book_stat(const c3_c* log_c) /* u3_book_save(): save [len_d] events starting at [eve_d]. ** -** byt_p: array of buffers (mug + jam) +** byt_p: array of buffers ** siz_i: array of buffer sizes +** +** uses double-buffered headers for single-fsync commits: +** 1. write deed data +** 2. write updated header to INACTIVE slot +** 3. 
single fsync makes both durable atomically */ c3_o u3_book_save(u3_book* txt_u, @@ -863,7 +1006,7 @@ u3_book_save(u3_book* txt_u, } // validate contiguity - if ( 0 == txt_u->hed_u.fir_d ) { + if ( 0 == txt_u->hed_u.fir_d && 0 == txt_u->las_d ) { // empty log: first event must be the first event in the epoch if ( epo_d + 1 != eve_d ) { fprintf(stderr, "book: first event must be start of epoch, " @@ -871,21 +1014,8 @@ u3_book_save(u3_book* txt_u, "\r\n", epo_d + 1, eve_d); return c3n; } - txt_u->hed_u.fir_d = eve_d; - - // persist fir_d (write-once) - if ( sizeof(c3_d) != pwrite(txt_u->fid_i, &txt_u->hed_u.fir_d, - sizeof(c3_d), offsetof(u3_book_head, fir_d)) ) - { - fprintf(stderr, "book: failed to write fir_d: %s\r\n", strerror(errno)); - return c3n; - } - - // sync fir_d before writing deeds to ensure header is durable - if ( -1 == c3_sync(txt_u->fid_i) ) { - fprintf(stderr, "book: failed to sync fir_d: %s\r\n", strerror(errno)); - return c3n; - } + // fir_d is the epoch base (last event before this epoch) + txt_u->hed_u.fir_d = epo_d; } else { // non-empty: must be contiguous @@ -898,7 +1028,7 @@ u3_book_save(u3_book* txt_u, // batch write all deeds using scatter-gather I/O // - // for each deed we need 3 iovec entries: head + jam + tail + // for each deed we need 3 iovec entries: len_d + buffer + let_d // pwritev has IOV_MAX limit (typically 1024), so we chunk if needed // now_d = txt_u->off_d; @@ -907,23 +1037,24 @@ u3_book_save(u3_book* txt_u, #define BOOK_IOV_MAX 1020 c3_w max_deeds_w = BOOK_IOV_MAX / 3; // 340 deeds per call - // allocate arrays for headers and tails (jam data comes from byt_p) - u3_book_deed_head* hed_u = c3_malloc(len_d * sizeof(u3_book_deed_head)); - u3_book_deed_tail* tal_u = c3_malloc(len_d * sizeof(u3_book_deed_tail)); + // allocate arrays for deed lengths and tails + c3_d* len_u = c3_malloc(len_d * sizeof(c3_d)); + c3_d* let_u = c3_malloc(len_d * sizeof(c3_d)); // iovec array sized for one chunk (reused for each pwritev call) + // 
each deed needs 3 iovecs: len_d + buffer + let_d c3_w iov_max_w = (len_d < max_deeds_w) ? len_d * 3 : BOOK_IOV_MAX; struct iovec* iov_u = c3_malloc(iov_max_w * sizeof(struct iovec)); - if ( !hed_u || !tal_u || !iov_u ) { - c3_free(hed_u); - c3_free(tal_u); + if ( !len_u || !let_u || !iov_u ) { + c3_free(len_u); + c3_free(let_u); c3_free(iov_u); fprintf(stderr, "book: failed to allocate batch write buffers\r\n"); return c3n; } - // first pass: populate headers and tails, calculate CRCs + // first pass: populate deed lengths and validation fields for ( c3_w i_w = 0; i_w < len_d; i_w++ ) { c3_y* buf_y = (c3_y*)byt_p[i_w]; c3_d siz_d = (c3_d)siz_i[i_w]; @@ -932,29 +1063,28 @@ u3_book_save(u3_book* txt_u, if ( siz_d < 4 ) { fprintf(stderr, "book: event %" PRIu64 " buffer too small: %" PRIu64 "\r\n", eve_d + i_w, siz_d); - c3_free(hed_u); - c3_free(tal_u); + c3_free(len_u); + c3_free(let_u); c3_free(iov_u); return c3n; } - // populate deed_head - c3_l mug_l; - memcpy(&mug_l, buf_y, 4); - hed_u[i_w].len_d = siz_d; - hed_u[i_w].mug_l = mug_l; + // populate deed fields + len_u[i_w] = siz_d; // complete buffer size - // populate deed_tail - tal_u[i_w].let_d = siz_d; + // populate deed tail validation field + let_u[i_w] = siz_d; } // second pass: write in chunks to respect IOV_MAX + // each deed now uses 3 iovecs: len_d + buffer + let_d + #define DEEDS_PER_CHUNK (BOOK_IOV_MAX / 3) // 340 deeds per call c3_w done_w = 0; // deeds written so far while ( done_w < len_d ) { c3_w chunk_w = len_d - done_w; - if ( chunk_w > max_deeds_w ) { - chunk_w = max_deeds_w; + if ( chunk_w > DEEDS_PER_CHUNK ) { + chunk_w = DEEDS_PER_CHUNK; } // build iovec for this chunk @@ -963,16 +1093,15 @@ u3_book_save(u3_book* txt_u, c3_w src_w = done_w + i_w; c3_w idx_w = i_w * 3; c3_y* buf_y = (c3_y*)byt_p[src_w]; - c3_d jaz_d = siz_i[src_w] - 4; - iov_u[idx_w + 0].iov_base = &hed_u[src_w]; - iov_u[idx_w + 0].iov_len = sizeof(u3_book_deed_head); - iov_u[idx_w + 1].iov_base = buf_y + 4; - iov_u[idx_w + 
1].iov_len = jaz_d; - iov_u[idx_w + 2].iov_base = &tal_u[src_w]; - iov_u[idx_w + 2].iov_len = sizeof(u3_book_deed_tail); + iov_u[idx_w + 0].iov_base = &len_u[src_w]; + iov_u[idx_w + 0].iov_len = sizeof(c3_d); + iov_u[idx_w + 1].iov_base = buf_y; + iov_u[idx_w + 1].iov_len = siz_i[src_w]; + iov_u[idx_w + 2].iov_base = &let_u[src_w]; + iov_u[idx_w + 2].iov_len = sizeof(c3_d); - chunk_z += sizeof(u3_book_deed_head) + jaz_d + sizeof(u3_book_deed_tail); + chunk_z += sizeof(c3_d) + siz_i[src_w] + sizeof(c3_d); } // pwritev for this chunk @@ -981,8 +1110,8 @@ u3_book_save(u3_book* txt_u, if ( ret_zs != (c3_zs)chunk_z ) { fprintf(stderr, "book: batch write failed: wrote %zd of %zu bytes: %s\r\n", ret_zs, chunk_z, strerror(errno)); - c3_free(hed_u); - c3_free(tal_u); + c3_free(len_u); + c3_free(let_u); c3_free(iov_u); return c3n; } @@ -991,34 +1120,34 @@ u3_book_save(u3_book* txt_u, done_w += chunk_w; } - c3_free(hed_u); - c3_free(tal_u); + c3_free(len_u); + c3_free(let_u); c3_free(iov_u); - // sync deed data to disk - if ( -1 == c3_sync(txt_u->fid_i) ) { - fprintf(stderr, "book: failed to sync events: %s\r\n", - strerror(errno)); - return c3n; - } - - // update header's las_d to signal successful commit + // prepare new header for inactive slot c3_d new_las_d = eve_d + len_d - 1; txt_u->hed_u.las_d = new_las_d; + txt_u->hed_u.seq_d++; + txt_u->hed_u.crc_w = _book_head_crc(&txt_u->hed_u); - if ( sizeof(c3_d) != pwrite(txt_u->fid_i, &txt_u->hed_u.las_d, - sizeof(c3_d), offsetof(u3_book_head, las_d)) ) + // write header to INACTIVE slot (double-buffer protocol) + c3_d slot_d = (txt_u->act_w == 0) ? 
BOOK_HEAD_B : BOOK_HEAD_A; + if ( sizeof(u3_book_head) != pwrite(txt_u->fid_i, &txt_u->hed_u, + sizeof(u3_book_head), slot_d) ) { - fprintf(stderr, "book: failed to write las_d: %s\r\n", strerror(errno)); + fprintf(stderr, "book: failed to write header: %s\r\n", strerror(errno)); return c3n; } - // sync header to finalize commit + // SINGLE fsync: makes both deed data and new header durable atomically if ( -1 == c3_sync(txt_u->fid_i) ) { - fprintf(stderr, "book: failed to sync las_d: %s\r\n", strerror(errno)); + fprintf(stderr, "book: failed to sync: %s\r\n", strerror(errno)); return c3n; } + // commit successful: switch active slot + txt_u->act_w = (txt_u->act_w == 0) ? 1 : 0; + // update cache txt_u->las_d = new_las_d; txt_u->off_d = now_d; @@ -1031,8 +1160,8 @@ u3_book_save(u3_book* txt_u, ** invokes callback for each event with: ** ptr_v: context pointer ** eve_d: event number -** len_i: buffer size (mug + jam) -** buf_v: buffer pointer (mug + jam format) +** len_z: buffer size +** buf_v: buffer pointer */ c3_o u3_book_read(u3_book* txt_u, @@ -1055,8 +1184,9 @@ u3_book_read(u3_book* txt_u, return c3n; } - if ( eve_d < txt_u->hed_u.fir_d || eve_d > txt_u->las_d ) { - fprintf(stderr, "book: event %" PRIu64 " out of range [%" PRIu64 ", %" PRIu64 "]\r\n", + // NB: fir_d is the epoch base; first stored event is fir_d + 1 + if ( eve_d <= txt_u->hed_u.fir_d || eve_d > txt_u->las_d ) { + fprintf(stderr, "book: event %" PRIu64 " out of range (%" PRIu64 ", %" PRIu64 "]\r\n", eve_d, txt_u->hed_u.fir_d, txt_u->las_d); return c3n; } @@ -1066,9 +1196,10 @@ u3_book_read(u3_book* txt_u, return c3n; } - // scan to starting event (events start after header) - off_d = sizeof(u3_book_head); - cur_d = txt_u->hed_u.fir_d; + // scan to starting event + // NB: fir_d is the epoch base; first deed is event fir_d + 1 + off_d = BOOK_DEED_BASE; + cur_d = txt_u->hed_u.fir_d + 1; while ( cur_d < eve_d ) { if ( c3n == _book_skip_deed(txt_u->fid_i, &off_d) ) { @@ -1093,14 +1224,14 @@ 
u3_book_read(u3_book* txt_u, // validate reed if ( c3n == _book_okay_reed(&red_u) ) { fprintf(stderr, "book: validation failed at event %" PRIu64 "\r\n", cur_d); - c3_free(red_u.jam_y); + c3_free(red_u.buf_y); return c3n; } - // convert to mug + jam format for callback + // convert to buffer format for callback buf_y = _book_reed_to_buff(&red_u, &len_z); if ( !buf_y ) { - c3_free(red_u.jam_y); + c3_free(red_u.buf_y); return c3n; } @@ -1236,8 +1367,9 @@ u3_book_walk_init(u3_book* txt_u, return c3n; } - if ( nex_d < txt_u->hed_u.fir_d || nex_d > txt_u->las_d ) { - fprintf(stderr, "book: walk_init start %" PRIu64 " out of range [%" PRIu64 ", %" PRIu64 "]\r\n", + // NB: fir_d is the epoch base; first stored event is fir_d + 1 + if ( nex_d <= txt_u->hed_u.fir_d || nex_d > txt_u->las_d ) { + fprintf(stderr, "book: walk_init start %" PRIu64 " out of range (%" PRIu64 ", %" PRIu64 "]\r\n", nex_d, txt_u->hed_u.fir_d, txt_u->las_d); return c3n; } @@ -1248,9 +1380,10 @@ u3_book_walk_init(u3_book* txt_u, return c3n; } - // scan to starting event (events start after header) - off_d = sizeof(u3_book_head); - cur_d = txt_u->hed_u.fir_d; + // scan to starting event + // NB: fir_d is the epoch base; first deed is event fir_d + 1 + off_d = BOOK_DEED_BASE; + cur_d = txt_u->hed_u.fir_d + 1; while ( cur_d < nex_d ) { if ( c3n == _book_skip_deed(txt_u->fid_i, &off_d) ) { @@ -1303,15 +1436,15 @@ u3_book_walk_next(u3_book_walk* itr_u, c3_z* len_z, void** buf_v) if ( c3n == _book_okay_reed(&red_u) ) { fprintf(stderr, "book: walk_next validation failed at event %" PRIu64 "\r\n", itr_u->nex_d); - c3_free(red_u.jam_y); + c3_free(red_u.buf_y); itr_u->liv_o = c3n; return c3n; } - // convert to mug + jam format + // convert to buffer format buf_y = _book_reed_to_buff(&red_u, len_z); if ( !buf_y ) { - c3_free(red_u.jam_y); + c3_free(red_u.buf_y); itr_u->liv_o = c3n; return c3n; } diff --git a/pkg/vere/db/book.h b/pkg/vere/db/book.h index d372920eeb..c4284a4704 100644 --- a/pkg/vere/db/book.h +++ 
b/pkg/vere/db/book.h @@ -5,18 +5,27 @@ #include "c3/c3.h" - /* book: append-only event log + /* book: mostly append-only event log + ** + ** uses double-buffered headers for single-fsync commits (like LMDB). + ** two header slots alternate; the one with higher valid seq_d is current. */ - /* u3_book_head: on-disk file header (24 bytes) + /* u3_book_head: on-disk file header (32 bytes, page-aligned slots) ** ** fir_d is write-once (set on first event save). ** las_d is updated after each batch of events is committed. + ** seq_d is monotonically increasing; determines which slot is current. + ** crc_w is CRC32 of preceding fields to detect partial writes. + ** + ** two header slots at offsets 0 and 4096; deeds start at 8192. */ typedef struct _u3_book_head { c3_w mag_w; // magic number: 0x424f4f4b ("BOOK") - c3_w ver_w; // format version: 2 + c3_w ver_w; // format version: 1 c3_d fir_d; // first event number in file c3_d las_d; // last event number (commit marker) + c3_d seq_d; // sequence number (for double-buffer) + c3_w crc_w; // CRC32 checksum (of preceding fields) } u3_book_head; /* u3_book_meta: on-disk metadata format (fixed 256 bytes) @@ -44,9 +53,10 @@ c3_i fid_i; // file descriptor for book.log c3_i met_i; // file descriptor for meta.bin c3_c* pax_c; // file path to book.log - u3_book_head hed_u; // cached header (immutable) + u3_book_head hed_u; // cached header (current valid state) c3_d las_d; // cached last event number c3_d off_d; // cached append offset (end of last event) + c3_w act_w; // active header slot (0 or 1) } u3_book; /* u3_book_walk: event iterator @@ -59,36 +69,28 @@ c3_o liv_o; // iterator valid } u3_book_walk; - /* u3_book_deed_head: on-disk deed header (12 bytes) - */ - typedef struct _u3_book_deed_head { - c3_d len_d; // payload size (mug + jam) - c3_l mug_l; // mug/hash - } u3_book_deed_head; - - /* u3_book_deed_tail: on-disk deed trailer - */ - typedef struct _u3_book_deed_tail { - c3_d let_d; // length trailer (validates len_d) - } 
u3_book_deed_tail; - - /* u3_book_deed: complete on-disk event record + /* u3_book_deed: on-disk event record + ** + ** on-disk format: len_d | buffer_data | let_d + ** where buffer_data is len_d bytes of opaque buffer data + ** and let_d echoes len_d for validation (used for backward scanning) ** - ** NB: not used directly for I/O due to variable-length jam data - ** actual format: deed_head | jam_data | deed_tail + ** NB: not used directly for I/O due to variable-length buffer data */ typedef struct _u3_book_deed { - u3_book_deed_head hed_u; - // c3_y jam_y[]; // variable-length jam data - u3_book_deed_tail tal_u; + c3_d len_d; // buffer size (bytes) + // c3_y buf_y[]; // variable-length buffer data + c3_d let_d; // length trailer (echoes len_d, used for backward scanning) } u3_book_deed; /* u3_book_reed: in-memory event record representation for I/O + ** + ** represents a complete event buffer including any prefixes. + ** the book API treats buffers as opaque byte arrays. */ typedef struct _u3_book_reed { - c3_d len_d; // total payload size - c3_l mug_l; // mug/hash - c3_y* jam_y; // jam data (caller owns, len = len_d - 4) + c3_d len_d; // total buffer size (bytes) + c3_y* buf_y; // complete buffer (caller owns) } u3_book_reed; /* u3_book_init(): open/create event log at [pax_c]. diff --git a/pkg/vere/lmdb_tests.c b/pkg/vere/lmdb_tests.c index 37e4c96ce4..2ad65a6f16 100644 --- a/pkg/vere/lmdb_tests.c +++ b/pkg/vere/lmdb_tests.c @@ -12,8 +12,8 @@ #define _alloc(sz) malloc(sz) #define _free(ptr) free(ptr) -// default mmap size for lmdb (1GB) -#define LMDB_MAP_SIZE (1ULL << 30) +// default mmap size for lmdb (2GB) +#define LMDB_MAP_SIZE (1ULL << 31) /* _test_make_tmpdir(): create unique temporary directory for lmdb. 
** @@ -278,8 +278,8 @@ main(int argc, char* argv[]) c3_i ret_i = 1; // benchmarks - ret_i &= _bench_write_speed(10000, 128); - ret_i &= _bench_write_speed_batched(10000, 1280, 100); + ret_i &= _bench_write_speed(1000, 128); + ret_i &= _bench_write_speed_batched(100000, 1280, 1000); fprintf(stderr, "\r\n"); if ( ret_i ) { From b37552bfc868d36c213ad96e6a4f235292f99fd4 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Mon, 2 Feb 2026 21:59:40 -0500 Subject: [PATCH 32/38] book: cleans double-buffer code --- pkg/vere/db/book.c | 312 +++++++++++---------------------------------- 1 file changed, 72 insertions(+), 240 deletions(-) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index f2ce5d17ba..af2abb4ffd 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -15,7 +15,7 @@ #include "noun.h" #include "ship.h" -// book: mostly append-only event log +// book: append-only event log // // simple file-based persistence layer for urbit's event log. // optimized for sequential writes and reads; no random access. @@ -27,8 +27,6 @@ // metadata stored in separate meta.bin file // -/* constants -*/ #define BOOK_MAGIC 0x424f4f4b // "BOOK" #define BOOK_VERSION 1 // format version @@ -37,21 +35,16 @@ #define BOOK_HEAD_B 4096 // second header slot #define BOOK_DEED_BASE 8192 // deeds start here -/* _book_head_crc(): compute header CRC32. -** -** computes CRC32 over all fields except crc_w. +/* _book_head_crc(): compute CRC32 of header fields. */ static c3_l _book_head_crc(const u3_book_head* hed_u) { - // checksum covers: mag_w, ver_w, fir_d, las_d, seq_d (28 bytes) c3_z len_z = offsetof(u3_book_head, crc_w); return (c3_l)crc32(0, (const c3_y*)hed_u, len_z); } -/* _book_head_okay(): validate header CRC and magic. -** -** returns: c3y if header is valid, c3n otherwise +/* _book_head_okay(): validate header magic, version, and checksum. 
*/ static c3_o _book_head_okay(const u3_book_head* hed_u) @@ -72,10 +65,9 @@ _book_head_okay(const u3_book_head* hed_u) return c3y; } -/* _book_meta_path(): construct path to meta.bin from book directory path. +/* _book_meta_path(): construct path to metadata file. ** -** pax_c should be a directory path (the one passed to u3_book_init) -** caller must free result with c3_free() +** NB: caller must free the result. */ static c3_c* _book_meta_path(const c3_c* pax_c) @@ -86,14 +78,11 @@ _book_meta_path(const c3_c* pax_c) return 0; } - // pax_c is already the directory, just append /meta.bin snprintf(met_c, strlen(pax_c) + 16, "%s/meta.bin", pax_c); return met_c; } -/* _book_init_meta_file(): open/create meta.bin file. -** -** returns: file descriptor, or -1 on error +/* _book_init_meta_file(): open or create metadata file. */ static c3_i _book_init_meta_file(const c3_c* pax_c) @@ -106,7 +95,6 @@ _book_init_meta_file(const c3_c* pax_c) return -1; } - // check file size; if zero, initialize with blank metadata struct stat buf_u; if ( 0 > fstat(met_i, &buf_u) ) { goto fail; @@ -134,9 +122,7 @@ _book_init_meta_file(const c3_c* pax_c) return -1; } -/* _book_read_meta_file(): read metadata from meta.bin. -** -** returns: c3y on success, c3n on failure +/* _book_read_meta_file(): read metadata from disk. */ static c3_o _book_read_meta_file(c3_i met_i, u3_book_meta* met_u) @@ -153,9 +139,7 @@ _book_read_meta_file(c3_i met_i, u3_book_meta* met_u) return c3y; } -/* _book_save_meta_file(): write metadata to meta.bin. -** -** returns: c3y on success, c3n on failure +/* _book_save_meta_file(): write metadata to disk. */ static c3_o _book_save_meta_file(c3_i met_i, const u3_book_meta* met_u) @@ -178,15 +162,14 @@ _book_save_meta_file(c3_i met_i, const u3_book_meta* met_u) /* _book_make_head(): initialize and write both header slots for new file. ** -** fir_d and las_d start at 0, updated when first events are saved. -** both header slots are initialized identically with seq_d = 0. 
+** fir_d and las_d start at 0, updated when first events are saved. +** both header slots are initialized identically with seq_d = 0. */ static c3_o _book_make_head(u3_book* txt_u) { c3_zs ret_zs; - // initialize header memset(&txt_u->hed_u, 0, sizeof(u3_book_head)); txt_u->hed_u.mag_w = BOOK_MAGIC; txt_u->hed_u.ver_w = BOOK_VERSION; @@ -195,7 +178,6 @@ _book_make_head(u3_book* txt_u) txt_u->hed_u.seq_d = 0; txt_u->hed_u.crc_w = _book_head_crc(&txt_u->hed_u); - // write header slot A ret_zs = pwrite(txt_u->fid_i, &txt_u->hed_u, sizeof(u3_book_head), BOOK_HEAD_A); @@ -205,7 +187,6 @@ _book_make_head(u3_book* txt_u) return c3n; } - // write header slot B (identical initially) ret_zs = pwrite(txt_u->fid_i, &txt_u->hed_u, sizeof(u3_book_head), BOOK_HEAD_B); @@ -215,7 +196,7 @@ _book_make_head(u3_book* txt_u) return c3n; } - // extend file to BOOK_DEED_BASE so it passes minimum size check on reopen + // extend file so it passes minimum size check on reopen if ( -1 == ftruncate(txt_u->fid_i, BOOK_DEED_BASE) ) { u3l_log("book: failed to extend file: %s\r\n", strerror(errno)); @@ -228,37 +209,39 @@ _book_make_head(u3_book* txt_u) return c3n; } - // start with slot A as active - txt_u->act_w = 0; + txt_u->act_w = 0; // start with slot A as active return c3y; } -/* _book_okay_head(): validate header fields (verbose version). -** -** returns: c3y if valid, c3n otherwise (prints error message) +/* _book_take_head(): select valid header from two candidates. 
*/ static c3_o -_book_okay_head(const u3_book_head* hed_u) +_book_take_head(const u3_book_head* hed_u, c3_o val_o, + const u3_book_head* deh_u, c3_o lav_o, + u3_book_head* out_u, c3_w* act_w) { - if ( BOOK_MAGIC != hed_u->mag_w ) { - fprintf(stderr, "book: invalid magic: 0x%08x\r\n", hed_u->mag_w); - return c3n; + if ( c3y == val_o && c3y == lav_o ) { + if ( hed_u->seq_d >= deh_u->seq_d ) { + *out_u = *hed_u; + if ( act_w ) *act_w = 0; // A + } else { + *out_u = *deh_u; + if ( act_w ) *act_w = 1; // B + } + return c3y; } - - if ( BOOK_VERSION != hed_u->ver_w ) { - fprintf(stderr, "book: unsupported version: %u\r\n", hed_u->ver_w); - return c3n; + if ( c3y == val_o ) { + *out_u = *hed_u; + if ( act_w ) *act_w = 0; // A + return c3y; } - - c3_w crc_w = _book_head_crc(hed_u); - if ( crc_w != hed_u->crc_w ) { - fprintf(stderr, "book: header checksum mismatch: 0x%08x != 0x%08x\r\n", - crc_w, hed_u->crc_w); - return c3n; + if ( c3y == lav_o ) { + *out_u = *deh_u; + if ( act_w ) *act_w = 1; // B + return c3y; } - - return c3y; + return c3n; } /* _book_read_head(): read both header slots and select valid one. 
@@ -277,7 +260,6 @@ _book_read_head(u3_book* txt_u) c3_o val_a, val_b; c3_zs ret_zs; - // read header slot A ret_zs = pread(txt_u->fid_i, &hed_a, sizeof(u3_book_head), BOOK_HEAD_A); if ( ret_zs != sizeof(u3_book_head) ) { fprintf(stderr, "book: failed to read header A\r\n"); @@ -287,7 +269,6 @@ _book_read_head(u3_book* txt_u) val_a = _book_head_okay(&hed_a); } - // read header slot B ret_zs = pread(txt_u->fid_i, &hed_b, sizeof(u3_book_head), BOOK_HEAD_B); if ( ret_zs != sizeof(u3_book_head) ) { fprintf(stderr, "book: failed to read header B\r\n"); @@ -297,27 +278,8 @@ _book_read_head(u3_book* txt_u) val_b = _book_head_okay(&hed_b); } - // select valid header with highest sequence number - if ( c3y == val_a && c3y == val_b ) { - // both valid: use higher sequence number - if ( hed_a.seq_d >= hed_b.seq_d ) { - txt_u->hed_u = hed_a; - txt_u->act_w = 0; - } - else { - txt_u->hed_u = hed_b; - txt_u->act_w = 1; - } - } - else if ( c3y == val_a ) { - txt_u->hed_u = hed_a; - txt_u->act_w = 0; - } - else if ( c3y == val_b ) { - txt_u->hed_u = hed_b; - txt_u->act_w = 1; - } - else { + if ( c3n == _book_take_head(&hed_a, val_a, &hed_b, val_b, + &txt_u->hed_u, &txt_u->act_w) ) { fprintf(stderr, "book: no valid header found\r\n"); return c3n; } @@ -330,30 +292,12 @@ _book_read_head(u3_book* txt_u) static inline c3_w _book_deed_size(c3_d len_d) { - // format: len_d (8) + buffer_data (len_d) + let_d (8) - // = 8 + len_d + 8 = len_d + 16 return sizeof(c3_d) + len_d + sizeof(c3_d); } -/* _book_okay_reed(): validate reed integrity. -*/ -static c3_o -_book_okay_reed(const u3_book_reed* red_u) -{ - // validate length - if ( 0 == red_u->len_d ) { - return c3n; - } - - return c3y; -} - -/* _book_reed_to_buff(): convert reed to byte buffer. -** -** allocates output buffer; caller must free. -** frees red_u->buf_y on success; caller must free on failure. +/* _book_reed_to_buff(): allocate buffer and copy deed data. 
** -** returns: allocated buffer, or 0 on allocation failure +** NB: caller must free the returned buffer. */ static c3_y* _book_reed_to_buff(u3_book_reed* red_u, c3_z* len_z) @@ -384,9 +328,7 @@ _book_read_deed(c3_i fid_i, c3_d* off_d, u3_book_reed* red_u) { c3_zs ret_zs; c3_d now_d = *off_d; - c3_d let_d; - // read deed head (len_d) c3_d len_d; ret_zs = pread(fid_i, &len_d, sizeof(c3_d), now_d); if ( ret_zs != sizeof(c3_d) ) { @@ -394,7 +336,6 @@ _book_read_deed(c3_i fid_i, c3_d* off_d, u3_book_reed* red_u) } now_d += sizeof(c3_d); - // read complete buffer data red_u->buf_y = c3_malloc(len_d); if ( !red_u->buf_y ) { return c3n; @@ -406,28 +347,20 @@ _book_read_deed(c3_i fid_i, c3_d* off_d, u3_book_reed* red_u) } now_d += len_d; - // read deed tail (let_d validation field) - c3_d let_d_read; - ret_zs = pread(fid_i, &let_d_read, sizeof(c3_d), now_d); + c3_d let_d; + ret_zs = pread(fid_i, &let_d, sizeof(c3_d), now_d); if ( ret_zs != sizeof(c3_d) ) { c3_free(red_u->buf_y); return c3n; } now_d += sizeof(c3_d); - // validate - let_d = let_d_read; - - // validate len_d == let_d if ( len_d != let_d ) { c3_free(red_u->buf_y); return c3n; } - // populate reed red_u->len_d = len_d; - - // update offset *off_d = now_d; return c3y; @@ -435,21 +368,14 @@ _book_read_deed(c3_i fid_i, c3_d* off_d, u3_book_reed* red_u) /* _book_save_deed(): save complete deed to file using scatter-gather I/O. ** -** uses pwritev() to write head + jam + tail in a single syscall. -** -** returns: -** c3y: success -** c3n: failure +** uses pwritev() to write head + buffer + tail in a single syscall. 
*/ static c3_o _book_save_deed(c3_i fid_i, c3_d* off_d, const u3_book_reed* red_u) { - c3_d len_d = red_u->len_d; // complete buffer size - - // prepare deed tail (validation field) + c3_d len_d = red_u->len_d; c3_d let_d = len_d; - // build iovec for scatter-gather write: len_d + buffer + let_d struct iovec iov_u[3]; iov_u[0].iov_base = &len_d; iov_u[0].iov_len = sizeof(c3_d); @@ -469,11 +395,7 @@ _book_save_deed(c3_i fid_i, c3_d* off_d, const u3_book_reed* red_u) return c3y; } -/* _book_skip_deed(): skip over deed without reading jam data. -** -** returns: -** c3y: success -** c3n: failure (EOF) +/* _book_skip_deed(): advance file offset past next deed without reading it. */ static c3_o _book_skip_deed(c3_i fid_i, c3_d* off_d) @@ -481,13 +403,11 @@ _book_skip_deed(c3_i fid_i, c3_d* off_d) c3_zs ret_zs; c3_d len_d; - // read only the len_d field ret_zs = pread(fid_i, &len_d, sizeof(c3_d), *off_d); if ( ret_zs != sizeof(c3_d) ) { return c3n; } - // skip entire deed: deed_head + jam + deed_tail *off_d += _book_deed_size(len_d); return c3y; @@ -516,7 +436,6 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) c3_d end_d; c3_d pos_d; - // get file size if ( -1 == fstat(txt_u->fid_i, &buf_u) ) { *off_d = BOOK_DEED_BASE; return c3n; @@ -539,21 +458,18 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) } pos_d = end_d; + c3_d min_size = sizeof(u3_book_deed) + sizeof(c3_d); - // scan backwards to validate last deed while ( pos_d > BOOK_DEED_BASE ) { c3_zs ret_zs; c3_d let_d; c3_d siz_d; - c3_d ded_d; // deed start offset - c3_d min_size = sizeof(u3_book_deed) + sizeof(c3_d); // minimum deed size + c3_d ded_d; - // need at least deed_tail size to read let_d if ( pos_d < BOOK_DEED_BASE + min_size ) { break; } - // read let_d from end of deed (last 8 bytes before pos_d) ret_zs = pread(txt_u->fid_i, &let_d, sizeof(c3_d), pos_d - sizeof(c3_d)); if ( ret_zs != sizeof(c3_d) ) { @@ -569,7 +485,6 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) ded_d = pos_d - siz_d; - // read and validate 
the deed { u3_book_reed red_u; c3_d tmp_d = ded_d; @@ -578,7 +493,7 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) break; } - if ( c3n == _book_okay_reed(&red_u) ) { + if ( 0 == red_u.len_d ) { c3_free(red_u.buf_y); break; } @@ -636,14 +551,11 @@ _book_scan_fore(u3_book* txt_u, c3_d* off_d) u3_book_reed red_u; c3_d beg_d = cur_d; - // read deed into reed if ( c3n == _book_read_deed(txt_u->fid_i, &cur_d, &red_u) ) { - // EOF or read error break; } - // validate reed (CRC and length checks) - if ( c3n == _book_okay_reed(&red_u) ) { + if ( 0 == red_u.len_d ) { u3l_log("book: validation failed at offset %" PRIu64 "\r\n", beg_d); c3_free(red_u.buf_y); break; @@ -743,30 +655,25 @@ u3_book_init(const c3_c* pax_c) struct stat buf_u; u3_book* txt_u = 0; - // construct path to book.log snprintf(log_c, sizeof(log_c), "%s/book.log", pax_c); - // open or create file fid_i = c3_open(log_c, O_RDWR | O_CREAT, 0644); if ( 0 > fid_i ) { u3l_log("book: failed to open %s: %s\r\n", log_c, strerror(errno)); return 0; } - // open/create meta.bin file met_i = _book_init_meta_file(pax_c); if ( 0 > met_i ) { u3l_log("book: failed to open meta.bin\r\n"); goto fail1; } - // get file size if ( 0 > fstat(fid_i, &buf_u) ) { u3l_log("book: fstat failed: %s\r\n", strerror(errno)); goto fail2; } - // allocate log structure txt_u = c3_calloc(sizeof(u3_book)); txt_u->fid_i = fid_i; txt_u->met_i = met_i; @@ -855,7 +762,7 @@ u3_book_init(const c3_c* pax_c) return 0; } -/* u3_book_exit(): close event log. +/* u3_book_exit(): close event log and release resources. */ void u3_book_exit(u3_book* txt_u) @@ -864,20 +771,17 @@ u3_book_exit(u3_book* txt_u) return; } - // close book.log file close(txt_u->fid_i); - // close meta.bin file if ( 0 <= txt_u->met_i ) { close(txt_u->met_i); } - // free resources c3_free(txt_u->pax_c); c3_free(txt_u); } -/* u3_book_gulf(): read first and last event numbers. +/* u3_book_gulf(): read first and last event numbers from log. 
*/ c3_o u3_book_gulf(u3_book* txt_u, c3_d* low_d, c3_d* hig_d) @@ -892,8 +796,6 @@ u3_book_gulf(u3_book* txt_u, c3_d* low_d, c3_d* hig_d) return c3y; } -/* u3_book_stat(): print book statistics. expects path to book.log. -*/ void u3_book_stat(const c3_c* log_c) { @@ -902,14 +804,12 @@ u3_book_stat(const c3_c* log_c) c3_o val_a, val_b; struct stat buf_u; - // open the file directly fid_i = c3_open(log_c, O_RDONLY, 0); if ( fid_i < 0 ) { fprintf(stderr, "book: failed to open %s: %s\r\n", log_c, strerror(errno)); return; } - // read both header slots and pick valid one c3_zs ret_zs; ret_zs = pread(fid_i, &hed_a, sizeof(u3_book_head), BOOK_HEAD_A); val_a = (ret_zs == sizeof(u3_book_head)) ? _book_head_okay(&hed_a) : c3n; @@ -917,16 +817,7 @@ u3_book_stat(const c3_c* log_c) ret_zs = pread(fid_i, &hed_b, sizeof(u3_book_head), BOOK_HEAD_B); val_b = (ret_zs == sizeof(u3_book_head)) ? _book_head_okay(&hed_b) : c3n; - if ( c3y == val_a && c3y == val_b ) { - hed_u = (hed_a.seq_d >= hed_b.seq_d) ? hed_a : hed_b; - } - else if ( c3y == val_a ) { - hed_u = hed_a; - } - else if ( c3y == val_b ) { - hed_u = hed_b; - } - else { + if ( c3n == _book_take_head(&hed_a, val_a, &hed_b, val_b, &hed_u, 0) ) { fprintf(stderr, "book: no valid header found\r\n"); close(fid_i); return; @@ -946,8 +837,6 @@ u3_book_stat(const c3_c* log_c) fprintf(stderr, " sequence: %" PRIu64 "\r\n", hed_u.seq_d); fprintf(stderr, " file size: %lld bytes\r\n", (long long)buf_u.st_size); - // read metadata from meta.bin - // extract directory from log_c path (lop off "/book.log" suffix) u3_book_meta met_u; c3_c* epo_c = 0; { @@ -1033,16 +922,12 @@ u3_book_save(u3_book* txt_u, // now_d = txt_u->off_d; - // max iovecs per pwritev call (use 1020 to be safe, divisible by 3) #define BOOK_IOV_MAX 1020 - c3_w max_deeds_w = BOOK_IOV_MAX / 3; // 340 deeds per call + c3_w max_deeds_w = BOOK_IOV_MAX / 3; - // allocate arrays for deed lengths and tails c3_d* len_u = c3_malloc(len_d * sizeof(c3_d)); c3_d* let_u = 
c3_malloc(len_d * sizeof(c3_d)); - // iovec array sized for one chunk (reused for each pwritev call) - // each deed needs 3 iovecs: len_d + buffer + let_d c3_w iov_max_w = (len_d < max_deeds_w) ? len_d * 3 : BOOK_IOV_MAX; struct iovec* iov_u = c3_malloc(iov_max_w * sizeof(struct iovec)); @@ -1054,12 +939,10 @@ u3_book_save(u3_book* txt_u, return c3n; } - // first pass: populate deed lengths and validation fields for ( c3_w i_w = 0; i_w < len_d; i_w++ ) { c3_y* buf_y = (c3_y*)byt_p[i_w]; c3_d siz_d = (c3_d)siz_i[i_w]; - // validate buffer size if ( siz_d < 4 ) { fprintf(stderr, "book: event %" PRIu64 " buffer too small: %" PRIu64 "\r\n", eve_d + i_w, siz_d); @@ -1069,28 +952,22 @@ u3_book_save(u3_book* txt_u, return c3n; } - // populate deed fields - len_u[i_w] = siz_d; // complete buffer size - - // populate deed tail validation field + len_u[i_w] = siz_d; let_u[i_w] = siz_d; } - // second pass: write in chunks to respect IOV_MAX - // each deed now uses 3 iovecs: len_d + buffer + let_d - #define DEEDS_PER_CHUNK (BOOK_IOV_MAX / 3) // 340 deeds per call - c3_w done_w = 0; // deeds written so far + #define DEEDS_PER_CHUNK (BOOK_IOV_MAX / 3) + c3_w dun_w = 0; - while ( done_w < len_d ) { - c3_w chunk_w = len_d - done_w; - if ( chunk_w > DEEDS_PER_CHUNK ) { - chunk_w = DEEDS_PER_CHUNK; + while ( dun_w < len_d ) { + c3_w cun_w = len_d - dun_w; + if ( cun_w > DEEDS_PER_CHUNK ) { + cun_w = DEEDS_PER_CHUNK; } - // build iovec for this chunk - c3_z chunk_z = 0; // bytes in this chunk - for ( c3_w i_w = 0; i_w < chunk_w; i_w++ ) { - c3_w src_w = done_w + i_w; + c3_z cun_z = 0; + for ( c3_w i_w = 0; i_w < cun_w; i_w++ ) { + c3_w src_w = dun_w + i_w; c3_w idx_w = i_w * 3; c3_y* buf_y = (c3_y*)byt_p[src_w]; @@ -1101,36 +978,34 @@ u3_book_save(u3_book* txt_u, iov_u[idx_w + 2].iov_base = &let_u[src_w]; iov_u[idx_w + 2].iov_len = sizeof(c3_d); - chunk_z += sizeof(c3_d) + siz_i[src_w] + sizeof(c3_d); + cun_z += sizeof(c3_d) + siz_i[src_w] + sizeof(c3_d); } - // pwritev for this 
chunk - c3_zs ret_zs = pwritev(txt_u->fid_i, iov_u, chunk_w * 3, now_d); + c3_zs ret_zs = pwritev(txt_u->fid_i, iov_u, cun_w * 3, now_d); - if ( ret_zs != (c3_zs)chunk_z ) { + if ( ret_zs != (c3_zs)cun_z ) { fprintf(stderr, "book: batch write failed: wrote %zd of %zu bytes: %s\r\n", - ret_zs, chunk_z, strerror(errno)); + ret_zs, cun_z, strerror(errno)); c3_free(len_u); c3_free(let_u); c3_free(iov_u); return c3n; } - now_d += chunk_z; - done_w += chunk_w; + now_d += cun_z; + dun_w += cun_w; } c3_free(len_u); c3_free(let_u); c3_free(iov_u); - // prepare new header for inactive slot c3_d new_las_d = eve_d + len_d - 1; txt_u->hed_u.las_d = new_las_d; txt_u->hed_u.seq_d++; txt_u->hed_u.crc_w = _book_head_crc(&txt_u->hed_u); - // write header to INACTIVE slot (double-buffer protocol) + // write to inactive slot (double-buffer protocol) c3_d slot_d = (txt_u->act_w == 0) ? BOOK_HEAD_B : BOOK_HEAD_A; if ( sizeof(u3_book_head) != pwrite(txt_u->fid_i, &txt_u->hed_u, sizeof(u3_book_head), slot_d) ) @@ -1139,29 +1014,20 @@ u3_book_save(u3_book* txt_u, return c3n; } - // SINGLE fsync: makes both deed data and new header durable atomically + // single fsync: makes both deed data and new header durable atomically if ( -1 == c3_sync(txt_u->fid_i) ) { fprintf(stderr, "book: failed to sync: %s\r\n", strerror(errno)); return c3n; } - // commit successful: switch active slot txt_u->act_w = (txt_u->act_w == 0) ? 1 : 0; - - // update cache txt_u->las_d = new_las_d; txt_u->off_d = now_d; return c3y; } -/* u3_book_read(): read [len_d] events starting at [eve_d]. -** -** invokes callback for each event with: -** ptr_v: context pointer -** eve_d: event number -** len_z: buffer size -** buf_v: buffer pointer +/* u3_book_read(): read events from log, invoking callback for each event. 
*/ c3_o u3_book_read(u3_book* txt_u, @@ -1177,9 +1043,7 @@ u3_book_read(u3_book* txt_u, return c3n; } - // validate range if ( 0 == txt_u->las_d ) { - // empty log fprintf(stderr, "book: read from empty log\r\n"); return c3n; } @@ -1196,7 +1060,6 @@ u3_book_read(u3_book* txt_u, return c3n; } - // scan to starting event // NB: fir_d is the epoch base; first deed is event fir_d + 1 off_d = BOOK_DEED_BASE; cur_d = txt_u->hed_u.fir_d + 1; @@ -1209,33 +1072,28 @@ u3_book_read(u3_book* txt_u, cur_d++; } - // read requested events for ( c3_d i_d = 0; i_d < len_d; i_d++, cur_d++ ) { u3_book_reed red_u; c3_y* buf_y; c3_z len_z; - // read deed into reed if ( c3n == _book_read_deed(txt_u->fid_i, &off_d, &red_u) ) { fprintf(stderr, "book: failed to read event %" PRIu64 "\r\n", cur_d); return c3n; } - // validate reed - if ( c3n == _book_okay_reed(&red_u) ) { + if ( 0 == red_u.len_d ) { fprintf(stderr, "book: validation failed at event %" PRIu64 "\r\n", cur_d); c3_free(red_u.buf_y); return c3n; } - // convert to buffer format for callback buf_y = _book_reed_to_buff(&red_u, &len_z); if ( !buf_y ) { c3_free(red_u.buf_y); return c3n; } - // invoke callback if ( c3n == read_f(ptr_v, cur_d, len_z, buf_y) ) { c3_free(buf_y); return c3n; @@ -1247,11 +1105,6 @@ u3_book_read(u3_book* txt_u, return c3y; } -/* u3_book_read_meta(): read fixed metadata section via callback. -** -** key_c: metadata key -** invokes callback with (ptr_v, len, data) or (ptr_v, -1, 0) if not found. 
-*/ void u3_book_read_meta(u3_book* txt_u, void* ptr_v, @@ -1265,14 +1118,12 @@ u3_book_read_meta(u3_book* txt_u, return; } - // read metadata from meta.bin if ( c3n == _book_read_meta_file(txt_u->met_i, &met_u) ) { u3l_log("book: read_meta: failed to read metadata\r\n"); read_f(ptr_v, -1, 0); return; } - // match key and extract corresponding field if ( 0 == strcmp(key_c, "version") ) { read_f(ptr_v, sizeof(c3_w), &met_u.ver_w); } @@ -1291,10 +1142,6 @@ u3_book_read_meta(u3_book* txt_u, } /* u3_book_save_meta(): write fixed metadata section. -** -** key_c: metadata key -** val_z: value size in bytes -** val_p: pointer to value data */ c3_o u3_book_save_meta(u3_book* txt_u, @@ -1308,13 +1155,11 @@ u3_book_save_meta(u3_book* txt_u, return c3n; } - // read current metadata from meta.bin if ( c3n == _book_read_meta_file(txt_u->met_i, &met_u) ) { u3l_log("book: save_meta: failed to read current metadata\r\n"); return c3n; } - // update field based on key if ( 0 == strcmp(key_c, "version") ) { if ( val_z != sizeof(c3_w) ) return c3n; memcpy(&met_u.ver_w, val_p, val_z); @@ -1335,7 +1180,6 @@ u3_book_save_meta(u3_book* txt_u, return c3n; } - // write metadata to meta.bin if ( c3n == _book_save_meta_file(txt_u->met_i, &met_u) ) { u3l_log("book: save_meta: failed to write metadata\r\n"); return c3n; @@ -1345,8 +1189,6 @@ u3_book_save_meta(u3_book* txt_u, } /* u3_book_walk_init(): initialize event iterator. -** -** sets up iterator to read events from [nex_d] to [las_d] inclusive. 
*/ c3_o u3_book_walk_init(u3_book* txt_u, @@ -1361,7 +1203,6 @@ u3_book_walk_init(u3_book* txt_u, return c3n; } - // validate range if ( 0 == txt_u->las_d ) { fprintf(stderr, "book: walk_init on empty log\r\n"); return c3n; @@ -1380,7 +1221,6 @@ u3_book_walk_init(u3_book* txt_u, return c3n; } - // scan to starting event // NB: fir_d is the epoch base; first deed is event fir_d + 1 off_d = BOOK_DEED_BASE; cur_d = txt_u->hed_u.fir_d + 1; @@ -1393,7 +1233,6 @@ u3_book_walk_init(u3_book* txt_u, cur_d++; } - // initialize iterator itr_u->fid_i = txt_u->fid_i; itr_u->nex_d = nex_d; itr_u->las_d = las_d; @@ -1418,13 +1257,11 @@ u3_book_walk_next(u3_book_walk* itr_u, c3_z* len_z, void** buf_v) return c3n; } - // check if we've reached the end if ( itr_u->nex_d > itr_u->las_d ) { itr_u->liv_o = c3n; return c3n; } - // read deed into reed if ( c3n == _book_read_deed(itr_u->fid_i, &itr_u->off_d, &red_u) ) { fprintf(stderr, "book: walk_next failed to read event %" PRIu64 "\r\n", itr_u->nex_d); @@ -1432,8 +1269,7 @@ u3_book_walk_next(u3_book_walk* itr_u, c3_z* len_z, void** buf_v) return c3n; } - // validate reed - if ( c3n == _book_okay_reed(&red_u) ) { + if ( 0 == red_u.len_d ) { fprintf(stderr, "book: walk_next validation failed at event %" PRIu64 "\r\n", itr_u->nex_d); c3_free(red_u.buf_y); @@ -1441,7 +1277,6 @@ u3_book_walk_next(u3_book_walk* itr_u, c3_z* len_z, void** buf_v) return c3n; } - // convert to buffer format buf_y = _book_reed_to_buff(&red_u, len_z); if ( !buf_y ) { c3_free(red_u.buf_y); @@ -1450,8 +1285,6 @@ u3_book_walk_next(u3_book_walk* itr_u, c3_z* len_z, void** buf_v) } *buf_v = buf_y; - - // advance to next event itr_u->nex_d++; return c3y; @@ -1466,7 +1299,6 @@ u3_book_walk_done(u3_book_walk* itr_u) return; } - // mark iterator as invalid itr_u->liv_o = c3n; itr_u->fid_i = -1; } From 70ff35710b3ea35b7665eab95289a48a17ecf025 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Wed, 4 Feb 2026 11:24:27 -0500 Subject: [PATCH 33/38] book: cleans entire api --- 
pkg/vere/db/book.c | 271 ++++++++++++++++----------------------------- 1 file changed, 94 insertions(+), 177 deletions(-) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index af2abb4ffd..e6489c376e 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -65,6 +65,34 @@ _book_head_okay(const u3_book_head* hed_u) return c3y; } +/* _book_save_head(): write header to inactive slot, sync, swap active. +** +** caller must set hed_u fields (e.g. las_d) before calling. +** increments seq_d, recomputes crc_w, and swaps active slot. +*/ +static c3_o +_book_save_head(u3_book* txt_u) +{ + txt_u->hed_u.seq_d++; + txt_u->hed_u.crc_w = _book_head_crc(&txt_u->hed_u); + + c3_d slot_d = (txt_u->act_w == 0) ? BOOK_HEAD_B : BOOK_HEAD_A; + if ( sizeof(u3_book_head) != pwrite(txt_u->fid_i, &txt_u->hed_u, + sizeof(u3_book_head), slot_d) ) + { + fprintf(stderr, "book: failed to write header: %s\r\n", strerror(errno)); + return c3n; + } + + if ( -1 == c3_sync(txt_u->fid_i) ) { + fprintf(stderr, "book: failed to sync: %s\r\n", strerror(errno)); + return c3n; + } + + txt_u->act_w = (txt_u->act_w == 0) ? 1 : 0; + return c3y; +} + /* _book_meta_path(): construct path to metadata file. ** ** NB: caller must free the result. @@ -162,19 +190,22 @@ _book_save_meta_file(c3_i met_i, const u3_book_meta* met_u) /* _book_make_head(): initialize and write both header slots for new file. ** -** fir_d and las_d start at 0, updated when first events are saved. +** caller should set fir_d and las_d on txt_u->hed_u before calling +** (e.g. to epoch base for non-zero epochs, or 0 for fresh logs). ** both header slots are initialized identically with seq_d = 0. 
*/ static c3_o _book_make_head(u3_book* txt_u) { c3_zs ret_zs; + c3_d fir_d = txt_u->hed_u.fir_d; + c3_d las_d = txt_u->hed_u.las_d; memset(&txt_u->hed_u, 0, sizeof(u3_book_head)); txt_u->hed_u.mag_w = BOOK_MAGIC; txt_u->hed_u.ver_w = BOOK_VERSION; - txt_u->hed_u.fir_d = 0; - txt_u->hed_u.las_d = 0; + txt_u->hed_u.fir_d = fir_d; + txt_u->hed_u.las_d = las_d; txt_u->hed_u.seq_d = 0; txt_u->hed_u.crc_w = _book_head_crc(&txt_u->hed_u); @@ -295,25 +326,6 @@ _book_deed_size(c3_d len_d) return sizeof(c3_d) + len_d + sizeof(c3_d); } -/* _book_reed_to_buff(): allocate buffer and copy deed data. -** -** NB: caller must free the returned buffer. -*/ -static c3_y* -_book_reed_to_buff(u3_book_reed* red_u, c3_z* len_z) -{ - *len_z = red_u->len_d; - c3_y* buf_y = c3_malloc(*len_z); - - if ( !buf_y ) { - return 0; - } - - memcpy(buf_y, red_u->buf_y, red_u->len_d); - c3_free(red_u->buf_y); - - return buf_y; -} /* _book_read_deed(): read deed from file into [red_u]. ** @@ -366,35 +378,6 @@ _book_read_deed(c3_i fid_i, c3_d* off_d, u3_book_reed* red_u) return c3y; } -/* _book_save_deed(): save complete deed to file using scatter-gather I/O. -** -** uses pwritev() to write head + buffer + tail in a single syscall. -*/ -static c3_o -_book_save_deed(c3_i fid_i, c3_d* off_d, const u3_book_reed* red_u) -{ - c3_d len_d = red_u->len_d; - c3_d let_d = len_d; - - struct iovec iov_u[3]; - iov_u[0].iov_base = &len_d; - iov_u[0].iov_len = sizeof(c3_d); - iov_u[1].iov_base = red_u->buf_y; - iov_u[1].iov_len = len_d; - iov_u[2].iov_base = &let_d; - iov_u[2].iov_len = sizeof(c3_d); - - c3_z tot_z = sizeof(c3_d) + len_d + sizeof(c3_d); - c3_zs ret_zs = pwritev(fid_i, iov_u, 3, *off_d); - - if ( ret_zs != (c3_zs)tot_z ) { - return c3n; - } - - *off_d += tot_z; - return c3y; -} - /* _book_skip_deed(): advance file offset past next deed without reading it. 
*/ static c3_o @@ -460,55 +443,53 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) pos_d = end_d; c3_d min_size = sizeof(u3_book_deed) + sizeof(c3_d); - while ( pos_d > BOOK_DEED_BASE ) { - c3_zs ret_zs; - c3_d let_d; - c3_d siz_d; - c3_d ded_d; - - if ( pos_d < BOOK_DEED_BASE + min_size ) { - break; - } + // validate last deed by reading its trailing length field + if ( pos_d < BOOK_DEED_BASE + min_size ) { + *off_d = BOOK_DEED_BASE; + return c3n; + } - ret_zs = pread(txt_u->fid_i, &let_d, sizeof(c3_d), - pos_d - sizeof(c3_d)); - if ( ret_zs != sizeof(c3_d) ) { - break; - } + c3_zs ret_zs; + c3_d let_d; - // calculate deed size and start position - siz_d = _book_deed_size(let_d); - if ( siz_d > pos_d - BOOK_DEED_BASE ) { - // deed would extend before header - break; - } + ret_zs = pread(txt_u->fid_i, &let_d, sizeof(c3_d), + pos_d - sizeof(c3_d)); + if ( ret_zs != sizeof(c3_d) ) { + *off_d = BOOK_DEED_BASE; + return c3n; + } - ded_d = pos_d - siz_d; + // calculate deed size and start position + c3_d siz_d = _book_deed_size(let_d); + if ( siz_d > pos_d - BOOK_DEED_BASE ) { + // deed would extend before header + *off_d = BOOK_DEED_BASE; + return c3n; + } - { - u3_book_reed red_u; - c3_d tmp_d = ded_d; + c3_d ded_d = pos_d - siz_d; - if ( c3n == _book_read_deed(txt_u->fid_i, &tmp_d, &red_u) ) { - break; - } + { + u3_book_reed red_u; + c3_d tmp_d = ded_d; - if ( 0 == red_u.len_d ) { - c3_free(red_u.buf_y); - break; - } + if ( c3n == _book_read_deed(txt_u->fid_i, &tmp_d, &red_u) ) { + *off_d = BOOK_DEED_BASE; + return c3n; + } - // deed is valid — use header's las_d as authoritative + if ( 0 == red_u.len_d ) { c3_free(red_u.buf_y); - *off_d = pos_d; - txt_u->las_d = txt_u->hed_u.las_d; - return c3y; + *off_d = BOOK_DEED_BASE; + return c3n; } - } - // no valid deeds found - *off_d = BOOK_DEED_BASE; - return c3n; + // deed is valid — use header's las_d as authoritative + c3_free(red_u.buf_y); + *off_d = pos_d; + txt_u->las_d = txt_u->hed_u.las_d; + return c3y; + } } /* 
_book_scan_fore(): recovery forward scan to find last valid deed. @@ -593,22 +574,9 @@ _book_scan_fore(u3_book* txt_u, c3_d* off_d) } } - // update header to match recovered state (write to inactive slot) + // update header to match recovered state txt_u->hed_u.las_d = las_d; - txt_u->hed_u.seq_d++; - txt_u->hed_u.crc_w = _book_head_crc(&txt_u->hed_u); - - c3_d slot_d = (txt_u->act_w == 0) ? BOOK_HEAD_B : BOOK_HEAD_A; - if ( sizeof(u3_book_head) != pwrite(txt_u->fid_i, &txt_u->hed_u, - sizeof(u3_book_head), slot_d) ) - { - u3l_log("book: failed to update header: %s\r\n", strerror(errno)); - } else { - if ( -1 == c3_sync(txt_u->fid_i) ) { - u3l_log("book: failed to sync header: %s\r\n", strerror(errno)); - } - txt_u->act_w = (txt_u->act_w == 0) ? 1 : 0; - } + _book_save_head(txt_u); } else { txt_u->las_d = las_d; } @@ -684,43 +652,19 @@ u3_book_init(const c3_c* pax_c) strcpy(txt_u->pax_c, log_c); if ( buf_u.st_size == 0 ) { - // new file: initialize and write header - if ( c3n == _book_make_head(txt_u) ) { - goto fail4; - } - // extract epoch number from path c3_d epo_d; if ( c3n == _book_pull_epoc(pax_c, &epo_d) ) { goto fail3; } - if ( epo_d ) { - // update header with epoch info and rewrite both slots - txt_u->hed_u.fir_d = epo_d; - txt_u->hed_u.las_d = epo_d; - txt_u->hed_u.crc_w = _book_head_crc(&txt_u->hed_u); - - // write header slot A - if ( sizeof(u3_book_head) != pwrite(fid_i, &txt_u->hed_u, - sizeof(u3_book_head), BOOK_HEAD_A) ) - { - u3l_log("book: failed to write header A: %s\r\n", strerror(errno)); - goto fail4; - } - - // write header slot B - if ( sizeof(u3_book_head) != pwrite(fid_i, &txt_u->hed_u, - sizeof(u3_book_head), BOOK_HEAD_B) ) - { - u3l_log("book: failed to write header B: %s\r\n", strerror(errno)); - goto fail4; - } + // set epoch fields before writing header + txt_u->hed_u.fir_d = epo_d; + txt_u->hed_u.las_d = epo_d; - if ( -1 == c3_sync(fid_i) ) { - u3l_log("book: failed to sync header: %s\r\n", strerror(errno)); - goto fail4; - } + // 
new file: initialize and write header + if ( c3n == _book_make_head(txt_u) ) { + goto fail4; } txt_u->las_d = epo_d; @@ -925,35 +869,30 @@ u3_book_save(u3_book* txt_u, #define BOOK_IOV_MAX 1020 c3_w max_deeds_w = BOOK_IOV_MAX / 3; - c3_d* len_u = c3_malloc(len_d * sizeof(c3_d)); - c3_d* let_u = c3_malloc(len_d * sizeof(c3_d)); + c3_d* siz_u = c3_malloc(len_d * sizeof(c3_d)); c3_w iov_max_w = (len_d < max_deeds_w) ? len_d * 3 : BOOK_IOV_MAX; struct iovec* iov_u = c3_malloc(iov_max_w * sizeof(struct iovec)); - if ( !len_u || !let_u || !iov_u ) { - c3_free(len_u); - c3_free(let_u); + if ( !siz_u || !iov_u ) { + c3_free(siz_u); c3_free(iov_u); fprintf(stderr, "book: failed to allocate batch write buffers\r\n"); return c3n; } for ( c3_w i_w = 0; i_w < len_d; i_w++ ) { - c3_y* buf_y = (c3_y*)byt_p[i_w]; c3_d siz_d = (c3_d)siz_i[i_w]; if ( siz_d < 4 ) { fprintf(stderr, "book: event %" PRIu64 " buffer too small: %" PRIu64 "\r\n", eve_d + i_w, siz_d); - c3_free(len_u); - c3_free(let_u); + c3_free(siz_u); c3_free(iov_u); return c3n; } - len_u[i_w] = siz_d; - let_u[i_w] = siz_d; + siz_u[i_w] = siz_d; } #define DEEDS_PER_CHUNK (BOOK_IOV_MAX / 3) @@ -971,11 +910,11 @@ u3_book_save(u3_book* txt_u, c3_w idx_w = i_w * 3; c3_y* buf_y = (c3_y*)byt_p[src_w]; - iov_u[idx_w + 0].iov_base = &len_u[src_w]; + iov_u[idx_w + 0].iov_base = &siz_u[src_w]; iov_u[idx_w + 0].iov_len = sizeof(c3_d); iov_u[idx_w + 1].iov_base = buf_y; iov_u[idx_w + 1].iov_len = siz_i[src_w]; - iov_u[idx_w + 2].iov_base = &let_u[src_w]; + iov_u[idx_w + 2].iov_base = &siz_u[src_w]; iov_u[idx_w + 2].iov_len = sizeof(c3_d); cun_z += sizeof(c3_d) + siz_i[src_w] + sizeof(c3_d); @@ -986,8 +925,7 @@ u3_book_save(u3_book* txt_u, if ( ret_zs != (c3_zs)cun_z ) { fprintf(stderr, "book: batch write failed: wrote %zd of %zu bytes: %s\r\n", ret_zs, cun_z, strerror(errno)); - c3_free(len_u); - c3_free(let_u); + c3_free(siz_u); c3_free(iov_u); return c3n; } @@ -996,31 +934,17 @@ u3_book_save(u3_book* txt_u, dun_w += cun_w; } - 
c3_free(len_u); - c3_free(let_u); + c3_free(siz_u); c3_free(iov_u); c3_d new_las_d = eve_d + len_d - 1; txt_u->hed_u.las_d = new_las_d; - txt_u->hed_u.seq_d++; - txt_u->hed_u.crc_w = _book_head_crc(&txt_u->hed_u); - - // write to inactive slot (double-buffer protocol) - c3_d slot_d = (txt_u->act_w == 0) ? BOOK_HEAD_B : BOOK_HEAD_A; - if ( sizeof(u3_book_head) != pwrite(txt_u->fid_i, &txt_u->hed_u, - sizeof(u3_book_head), slot_d) ) - { - fprintf(stderr, "book: failed to write header: %s\r\n", strerror(errno)); - return c3n; - } - // single fsync: makes both deed data and new header durable atomically - if ( -1 == c3_sync(txt_u->fid_i) ) { - fprintf(stderr, "book: failed to sync: %s\r\n", strerror(errno)); + // commit header: write to inactive slot, fsync, swap active + if ( c3n == _book_save_head(txt_u) ) { return c3n; } - txt_u->act_w = (txt_u->act_w == 0) ? 1 : 0; txt_u->las_d = new_las_d; txt_u->off_d = now_d; @@ -1088,11 +1012,8 @@ u3_book_read(u3_book* txt_u, return c3n; } - buf_y = _book_reed_to_buff(&red_u, &len_z); - if ( !buf_y ) { - c3_free(red_u.buf_y); - return c3n; - } + len_z = red_u.len_d; + buf_y = red_u.buf_y; if ( c3n == read_f(ptr_v, cur_d, len_z, buf_y) ) { c3_free(buf_y); @@ -1277,12 +1198,8 @@ u3_book_walk_next(u3_book_walk* itr_u, c3_z* len_z, void** buf_v) return c3n; } - buf_y = _book_reed_to_buff(&red_u, len_z); - if ( !buf_y ) { - c3_free(red_u.buf_y); - itr_u->liv_o = c3n; - return c3n; - } + *len_z = red_u.len_d; + buf_y = red_u.buf_y; *buf_v = buf_y; itr_u->nex_d++; From cdbcb1fd8c8ef8826595d02b977fb76e7852786a Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Wed, 4 Feb 2026 21:12:03 -0500 Subject: [PATCH 34/38] book: adds `_bench_write_speed_mixed` according to event histogram from ~mastyr-bottec --- pkg/vere/book_tests.c | 158 +++++++++++++++++++++++++++++++++++++++++- pkg/vere/lmdb_tests.c | 158 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 312 insertions(+), 4 deletions(-) diff --git a/pkg/vere/book_tests.c 
b/pkg/vere/book_tests.c index ab93076456..daed5b0c8a 100644 --- a/pkg/vere/book_tests.c +++ b/pkg/vere/book_tests.c @@ -1456,6 +1456,159 @@ _bench_write_speed_batched(c3_d num_d, c3_z siz_z, c3_d bat_d) return ret_i; } +/* _bench_write_speed_mixed(): benchmark mixed batch-size write performance. +** +** writes [num_d] events of [siz_z] bytes using a realistic distribution +** of batch sizes (1-9), interleaved via deterministic PRNG. +** reports total time, events/sec, MB/s, per-event latency, and save calls. +*/ +static c3_i +_bench_write_speed_mixed(c3_d num_d, c3_z siz_z) +{ + // batch size distribution from production telemetry + // + static const c3_d bat_d[9] = { 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + static const c3_d cnt_d[9] = { + 2128433, 407761, 234541, 89359, 41390, 21376, 10945, 5399, 5466 + }; + + // compute original total events for scaling + // + c3_d ori_d = 0; + for ( c3_d i = 0; i < 9; i++ ) { + ori_d += bat_d[i] * cnt_d[i]; + } + + // scale counts proportionally to num_d + // + c3_d rem_d[9]; + c3_d tot_d = 0; + for ( c3_d i = 0; i < 9; i++ ) { + rem_d[i] = (cnt_d[i] * num_d) / ori_d; + if ( (0 == rem_d[i]) && (cnt_d[i] > 0) ) { + rem_d[i] = 1; + } + tot_d += rem_d[i]; + } + + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + + if ( !txt_u ) { + fprintf(stderr, " write_speed_mixed: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // pre-allocate event buffers for max batch size (9) + // + c3_y* evt_y[9]; + void* byt_p[9]; + c3_z siz_i[9]; + + for ( c3_d i = 0; i < 9; i++ ) { + evt_y[i] = _bench_make_event(siz_z, i + 1); + byt_p[i] = evt_y[i]; + siz_i[i] = siz_z; + } + + // deterministic xorshift32 PRNG + // + c3_w rng_w = 12345; + + c3_d wit_d = 0; // events written + c3_d cal_d = 0; // save calls made + + // start timing + // + c3_d beg_d = _bench_get_time_ns(); + + while ( tot_d > 0 ) { + // xorshift32 step + // + rng_w ^= rng_w << 13; + rng_w ^= rng_w >> 17; + rng_w ^= 
rng_w << 5; + + // weighted selection from remaining counts + // + c3_d pick = (c3_d)rng_w % tot_d; + c3_d acc = 0; + c3_d idx = 0; + + for ( idx = 0; idx < 9; idx++ ) { + acc += rem_d[idx]; + if ( pick < acc ) break; + } + + c3_d bsz = bat_d[idx]; + rem_d[idx]--; + tot_d--; + + // update mug patterns in event buffers + // + for ( c3_d j = 0; j < bsz; j++ ) { + c3_w mug_w = (c3_w)((wit_d + j + 1) * 0x12345678); + memcpy(evt_y[j], &mug_w, 4); + } + + c3_o sav_o = u3_book_save(txt_u, wit_d + 1, bsz, byt_p, siz_i, 0); + if ( c3n == sav_o ) { + fprintf(stderr, " write_speed_mixed: save failed at event %" PRIu64 "\r\n", + wit_d + 1); + ret_i = 0; + goto cleanup_buffers; + } + + wit_d += bsz; + cal_d++; + } + + // end timing + // + c3_d end_d = _bench_get_time_ns(); + c3_d lap_d = end_d - beg_d; + + // calculate metrics + // + double elapsed_sec = (double)lap_d / 1e9; + double events_per_sec = (double)wit_d / elapsed_sec; + double total_bytes = (double)wit_d * (double)siz_z; + double mb_per_sec = (total_bytes / (1024.0 * 1024.0)) / elapsed_sec; + double us_per_event = ((double)lap_d / 1000.0) / (double)wit_d; + + // report results + // + fprintf(stderr, "\r\n"); + fprintf(stderr, " write_speed benchmark (mixed batch sizes 1-9):\r\n"); + fprintf(stderr, " events written: %" PRIu64 "\r\n", wit_d); + fprintf(stderr, " save calls: %" PRIu64 "\r\n", cal_d); + fprintf(stderr, " event size: %" PRIu64 " bytes\r\n", (c3_d)siz_z); + fprintf(stderr, " total data: %.2f MB\r\n", total_bytes / (1024.0 * 1024.0)); + fprintf(stderr, " total time: %.3f seconds\r\n", elapsed_sec); + fprintf(stderr, " write speed: %.0f events/sec\r\n", events_per_sec); + fprintf(stderr, " throughput: %.2f MB/sec\r\n", mb_per_sec); + fprintf(stderr, " latency: %.1f us/event\r\n", us_per_event); + fprintf(stderr, "\r\n"); + +cleanup_buffers: + for ( c3_d i = 0; i < 9; i++ ) { + _free(evt_y[i]); + } + + u3_book_exit(txt_u); + +cleanup: + _test_rm_rf(dir_c); + _free(dir_c); + + fprintf(stderr, " 
write_speed_mixed_benchmark: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; +} + //============================================================================== // Main //============================================================================== @@ -1491,8 +1644,9 @@ main(int argc, char* argv[]) ret_i &= _test_metadata_size_validation(); // benchmarks - ret_i &= _bench_write_speed(1000, 128); - ret_i &= _bench_write_speed_batched(100000, 1280, 1000); + // ret_i &= _bench_write_speed(1000, 128); + // ret_i &= _bench_write_speed_batched(100000, 1280, 1000); + ret_i &= _bench_write_speed_mixed(10000, 128); fprintf(stderr, "\r\n"); if ( ret_i ) { diff --git a/pkg/vere/lmdb_tests.c b/pkg/vere/lmdb_tests.c index 2ad65a6f16..03778bc4dd 100644 --- a/pkg/vere/lmdb_tests.c +++ b/pkg/vere/lmdb_tests.c @@ -268,6 +268,159 @@ _bench_write_speed_batched(c3_d num_d, c3_z siz_z, c3_d bat_d) return ret_i; } +/* _bench_write_speed_mixed(): benchmark mixed batch-size write performance. +** +** writes [num_d] events of [siz_z] bytes using a realistic distribution +** of batch sizes (1-9), interleaved via deterministic PRNG. +** reports total time, events/sec, MB/s, per-event latency, and save calls. 
+*/ +static c3_i +_bench_write_speed_mixed(c3_d num_d, c3_z siz_z) +{ + // batch size distribution from production telemetry + // + static const c3_d bat_d[9] = { 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + static const c3_d cnt_d[9] = { + 2128433, 407761, 234541, 89359, 41390, 21376, 10945, 5399, 5466 + }; + + // compute original total events for scaling + // + c3_d ori_d = 0; + for ( c3_d i = 0; i < 9; i++ ) { + ori_d += bat_d[i] * cnt_d[i]; + } + + // scale counts proportionally to num_d + // + c3_d rem_d[9]; + c3_d tot_d = 0; + for ( c3_d i = 0; i < 9; i++ ) { + rem_d[i] = (cnt_d[i] * num_d) / ori_d; + if ( (0 == rem_d[i]) && (cnt_d[i] > 0) ) { + rem_d[i] = 1; + } + tot_d += rem_d[i]; + } + + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + MDB_env* env_u = u3_lmdb_init(dir_c, LMDB_MAP_SIZE); + + if ( !env_u ) { + fprintf(stderr, " write_speed_mixed: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // pre-allocate event buffers for max batch size (9) + // + c3_y* evt_y[9]; + void* byt_p[9]; + size_t siz_i[9]; + + for ( c3_d i = 0; i < 9; i++ ) { + evt_y[i] = _bench_make_event(siz_z, i + 1); + byt_p[i] = evt_y[i]; + siz_i[i] = siz_z; + } + + // deterministic xorshift32 PRNG + // + c3_w rng_w = 12345; + + c3_d wit_d = 0; // events written + c3_d cal_d = 0; // save calls made + + // start timing + // + c3_d beg_d = _bench_get_time_ns(); + + while ( tot_d > 0 ) { + // xorshift32 step + // + rng_w ^= rng_w << 13; + rng_w ^= rng_w >> 17; + rng_w ^= rng_w << 5; + + // weighted selection from remaining counts + // + c3_d pick = (c3_d)rng_w % tot_d; + c3_d acc = 0; + c3_d idx = 0; + + for ( idx = 0; idx < 9; idx++ ) { + acc += rem_d[idx]; + if ( pick < acc ) break; + } + + c3_d bsz = bat_d[idx]; + rem_d[idx]--; + tot_d--; + + // update mug patterns in event buffers + // + for ( c3_d j = 0; j < bsz; j++ ) { + c3_w mug_w = (c3_w)((wit_d + j + 1) * 0x12345678); + memcpy(evt_y[j], &mug_w, 4); + } + + c3_o sav_o = u3_lmdb_save(env_u, wit_d + 1, 
bsz, byt_p, siz_i); + if ( c3n == sav_o ) { + fprintf(stderr, " write_speed_mixed: save failed at event %" PRIu64 "\r\n", + wit_d + 1); + ret_i = 0; + goto cleanup_buffers; + } + + wit_d += bsz; + cal_d++; + } + + // end timing + // + c3_d end_d = _bench_get_time_ns(); + c3_d lap_d = end_d - beg_d; + + // calculate metrics + // + double elapsed_sec = (double)lap_d / 1e9; + double events_per_sec = (double)wit_d / elapsed_sec; + double total_bytes = (double)wit_d * (double)siz_z; + double mb_per_sec = (total_bytes / (1024.0 * 1024.0)) / elapsed_sec; + double us_per_event = ((double)lap_d / 1000.0) / (double)wit_d; + + // report results + // + fprintf(stderr, "\r\n"); + fprintf(stderr, " write_speed benchmark (mixed batch sizes 1-9):\r\n"); + fprintf(stderr, " events written: %" PRIu64 "\r\n", wit_d); + fprintf(stderr, " save calls: %" PRIu64 "\r\n", cal_d); + fprintf(stderr, " event size: %" PRIu64 " bytes\r\n", (c3_d)siz_z); + fprintf(stderr, " total data: %.2f MB\r\n", total_bytes / (1024.0 * 1024.0)); + fprintf(stderr, " total time: %.3f seconds\r\n", elapsed_sec); + fprintf(stderr, " write speed: %.0f events/sec\r\n", events_per_sec); + fprintf(stderr, " throughput: %.2f MB/sec\r\n", mb_per_sec); + fprintf(stderr, " latency: %.1f us/event\r\n", us_per_event); + fprintf(stderr, "\r\n"); + +cleanup_buffers: + for ( c3_d i = 0; i < 9; i++ ) { + _free(evt_y[i]); + } + + u3_lmdb_exit(env_u); + +cleanup: + _test_rm_rf(dir_c); + _free(dir_c); + + fprintf(stderr, " write_speed_mixed_benchmark: %s\r\n", ret_i ? 
"ok" : "FAILED"); + return ret_i; +} + //============================================================================== // Main //============================================================================== @@ -278,8 +431,9 @@ main(int argc, char* argv[]) c3_i ret_i = 1; // benchmarks - ret_i &= _bench_write_speed(1000, 128); - ret_i &= _bench_write_speed_batched(100000, 1280, 1000); + // ret_i &= _bench_write_speed(1000, 128); + // ret_i &= _bench_write_speed_batched(100000, 1280, 1000); + ret_i &= _bench_write_speed_mixed(10000, 128); fprintf(stderr, "\r\n"); if ( ret_i ) { From 58b7efda65aaff85fc591dc81dd6c76c805ecb0f Mon Sep 17 00:00:00 2001 From: Quodss Date: Thu, 5 Feb 2026 08:33:25 +0100 Subject: [PATCH 35/38] book: simplify batched writes --- pkg/vere/db/book.c | 50 +++++++++++----------------------------------- 1 file changed, 12 insertions(+), 38 deletions(-) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index e6489c376e..cfe5a71022 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -824,6 +824,9 @@ u3_book_stat(const c3_c* log_c) ** 2. write updated header to INACTIVE slot ** 3. single fsync makes both durable atomically */ + +static_assert(sizeof(c3_d) == sizeof(c3_z)); + c3_o u3_book_save(u3_book* txt_u, c3_d eve_d, @@ -866,43 +869,19 @@ u3_book_save(u3_book* txt_u, // now_d = txt_u->off_d; - #define BOOK_IOV_MAX 1020 - c3_w max_deeds_w = BOOK_IOV_MAX / 3; + #ifdef IOV_MAX + const c3_w max_deeds_w = IOV_MAX / 3; + #else + const c3_w max_deeds_w = 1020 / 3; + #endif - c3_d* siz_u = c3_malloc(len_d * sizeof(c3_d)); + struct iovec iov_u[max_deeds_w * 3]; - c3_w iov_max_w = (len_d < max_deeds_w) ? 
len_d * 3 : BOOK_IOV_MAX; - struct iovec* iov_u = c3_malloc(iov_max_w * sizeof(struct iovec)); - if ( !siz_u || !iov_u ) { - c3_free(siz_u); - c3_free(iov_u); - fprintf(stderr, "book: failed to allocate batch write buffers\r\n"); - return c3n; - } - - for ( c3_w i_w = 0; i_w < len_d; i_w++ ) { - c3_d siz_d = (c3_d)siz_i[i_w]; - - if ( siz_d < 4 ) { - fprintf(stderr, "book: event %" PRIu64 " buffer too small: %" PRIu64 "\r\n", - eve_d + i_w, siz_d); - c3_free(siz_u); - c3_free(iov_u); - return c3n; - } - - siz_u[i_w] = siz_d; - } - - #define DEEDS_PER_CHUNK (BOOK_IOV_MAX / 3) c3_w dun_w = 0; while ( dun_w < len_d ) { - c3_w cun_w = len_d - dun_w; - if ( cun_w > DEEDS_PER_CHUNK ) { - cun_w = DEEDS_PER_CHUNK; - } + c3_w cun_w = c3_min(len_d - dun_w, max_deeds_w); c3_z cun_z = 0; for ( c3_w i_w = 0; i_w < cun_w; i_w++ ) { @@ -910,11 +889,11 @@ u3_book_save(u3_book* txt_u, c3_w idx_w = i_w * 3; c3_y* buf_y = (c3_y*)byt_p[src_w]; - iov_u[idx_w + 0].iov_base = &siz_u[src_w]; + iov_u[idx_w + 0].iov_base = &siz_i[src_w]; iov_u[idx_w + 0].iov_len = sizeof(c3_d); iov_u[idx_w + 1].iov_base = buf_y; iov_u[idx_w + 1].iov_len = siz_i[src_w]; - iov_u[idx_w + 2].iov_base = &siz_u[src_w]; + iov_u[idx_w + 2].iov_base = &siz_i[src_w]; iov_u[idx_w + 2].iov_len = sizeof(c3_d); cun_z += sizeof(c3_d) + siz_i[src_w] + sizeof(c3_d); @@ -925,8 +904,6 @@ u3_book_save(u3_book* txt_u, if ( ret_zs != (c3_zs)cun_z ) { fprintf(stderr, "book: batch write failed: wrote %zd of %zu bytes: %s\r\n", ret_zs, cun_z, strerror(errno)); - c3_free(siz_u); - c3_free(iov_u); return c3n; } @@ -934,9 +911,6 @@ u3_book_save(u3_book* txt_u, dun_w += cun_w; } - c3_free(siz_u); - c3_free(iov_u); - c3_d new_las_d = eve_d + len_d - 1; txt_u->hed_u.las_d = new_las_d; From 65e4f15e00c73fd970f625c63e9f371c8221b9d6 Mon Sep 17 00:00:00 2001 From: Quodss Date: Thu, 5 Feb 2026 15:31:42 +0100 Subject: [PATCH 36/38] book: windows compatibility --- pkg/c3/platform/windows/compat.c | 51 +++++++++++++++++++++++++++----- 
pkg/c3/platform/windows/compat.h | 3 ++ pkg/vere/db/book.c | 4 ++- 3 files changed, 49 insertions(+), 9 deletions(-) diff --git a/pkg/c3/platform/windows/compat.c b/pkg/c3/platform/windows/compat.c index d47f3f8f28..795d09d3e1 100644 --- a/pkg/c3/platform/windows/compat.c +++ b/pkg/c3/platform/windows/compat.c @@ -5,6 +5,16 @@ #include #include "errno.h" +static void +ov_from_off_t(OVERLAPPED* ov, off_t offset) +{ + ov->OffsetHigh = (sizeof(off_t) <= sizeof(DWORD)) ? + (DWORD)0 : (DWORD)((offset >> 32) & 0xFFFFFFFFL); + + ov->Offset = (sizeof(off_t) <= sizeof(DWORD)) ? + (DWORD)offset : (DWORD)(offset & 0xFFFFFFFFL); +} + // set default CRT file mode to binary // note that mingw binmode.o does nothing #undef _fmode @@ -560,10 +570,7 @@ ssize_t pread(int fd, void *buf, size_t count, off_t offset) OVERLAPPED overlapped = {0}; - overlapped.OffsetHigh = (sizeof(off_t) <= sizeof(DWORD)) ? - (DWORD)0 : (DWORD)((offset >> 32) & 0xFFFFFFFFL); - overlapped.Offset = (sizeof(off_t) <= sizeof(DWORD)) ? - (DWORD)offset : (DWORD)(offset & 0xFFFFFFFFL); + ov_from_off_t(&overlapped, offset); HANDLE h = (HANDLE)_get_osfhandle(fd); @@ -590,10 +597,7 @@ ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset) OVERLAPPED overlapped = {0}; - overlapped.OffsetHigh = (sizeof(off_t) <= sizeof(DWORD)) ? - (DWORD)0 : (DWORD)((offset >> 32) & 0xFFFFFFFFL); - overlapped.Offset = (sizeof(off_t) <= sizeof(DWORD)) ? 
- (DWORD)offset : (DWORD)(offset & 0xFFFFFFFFL); + ov_from_off_t(&overlapped, offset); HANDLE h = (HANDLE)_get_osfhandle(fd); @@ -609,3 +613,34 @@ ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset) return (ssize_t)len; } + +ssize_t pwritev(int fd, const struct iovec* iov, size_t iovcnt, off_t offset) +{ + HANDLE h = (HANDLE)_get_osfhandle(fd); + + if ( INVALID_HANDLE_VALUE == h ) { + errno = EBADF; + return -1; + } + + DWORD written; + ssize_t len = 0; + OVERLAPPED ov = {0}; + + for (size_t i = 0; i < iovcnt; i++) { + ov_from_off_t(&ov, offset); + DWORD len_write = (DWORD)iov[i].iov_len; // XX chunk on large writes? + void* buf = iov[i].iov_base; + if ( !WriteFile(h, buf, len_write, &written, &ov) ) { + errno = err_win_to_posix(GetLastError()); + return -1; + } + + len += written; + offset += written; + + if ( written < iov[i].iov_len ) break; + } + + return len; +} \ No newline at end of file diff --git a/pkg/c3/platform/windows/compat.h b/pkg/c3/platform/windows/compat.h index 17338b8e16..94d31063a3 100644 --- a/pkg/c3/platform/windows/compat.h +++ b/pkg/c3/platform/windows/compat.h @@ -15,8 +15,11 @@ void *memmem(const void *h0, size_t k, const void *n0, size_t l); uint32_t getppid(); +struct iovec { void *iov_base; size_t iov_len; }; + ssize_t pread(int fd, void *buf, size_t count, off_t offset); ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset); +ssize_t pwritev(int fd, const struct iovec* iov, size_t iovcnt, off_t offset); #define SIGUSR1 10 #define SIGALRM 14 diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index cfe5a71022..b9350bfd26 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -3,7 +3,9 @@ #include "db/book.h" #include -#include +#ifndef U3_OS_windows +# include +#endif #include #include #include From 23d7e9ed4c2f5a78223ee86a2d7154785422450f Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Fri, 6 Feb 2026 10:14:57 -0500 Subject: [PATCH 37/38] book: uses `c3_d` by default --- pkg/vere/db/book.c | 64 
++++++++++++++++++++-------------------------- pkg/vere/db/book.h | 2 +- 2 files changed, 29 insertions(+), 37 deletions(-) diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index b9350bfd26..14403fccf0 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -322,7 +322,7 @@ _book_read_head(u3_book* txt_u) /* _book_deed_size(): calculate total on-disk size of deed. */ -static inline c3_w +static inline c3_d _book_deed_size(c3_d len_d) { return sizeof(c3_d) + len_d + sizeof(c3_d); @@ -818,9 +818,6 @@ u3_book_stat(const c3_c* log_c) /* u3_book_save(): save [len_d] events starting at [eve_d]. ** -** byt_p: array of buffers -** siz_i: array of buffer sizes -** ** uses double-buffered headers for single-fsync commits: ** 1. write deed data ** 2. write updated header to INACTIVE slot @@ -831,14 +828,12 @@ static_assert(sizeof(c3_d) == sizeof(c3_z)); c3_o u3_book_save(u3_book* txt_u, - c3_d eve_d, - c3_d len_d, - void** byt_p, - c3_z* siz_i, - c3_d epo_d) + c3_d eve_d, // first event + c3_d len_d, // number of events + void** byt_p, // array of bytes + c3_z* siz_i, // array of lengths + c3_d epo_d) // target epoch { - c3_d now_d; - if ( !txt_u ) { return c3n; } @@ -869,39 +864,36 @@ u3_book_save(u3_book* txt_u, // for each deed we need 3 iovec entries: len_d + buffer + let_d // pwritev has IOV_MAX limit (typically 1024), so we chunk if needed // - now_d = txt_u->off_d; - #ifdef IOV_MAX - const c3_w max_deeds_w = IOV_MAX / 3; + const c3_d max_ded_d = IOV_MAX / 3; #else - const c3_w max_deeds_w = 1020 / 3; + const c3_d max_ded_d = 1020 / 3; #endif - struct iovec iov_u[max_deeds_w * 3]; - - - c3_w dun_w = 0; + struct iovec iov_u[max_ded_d * 3]; + c3_d now_d = txt_u->off_d; + c3_d dun_d = 0; - while ( dun_w < len_d ) { - c3_w cun_w = c3_min(len_d - dun_w, max_deeds_w); + while ( dun_d < len_d ) { + c3_d cun_d = c3_min(len_d - dun_d, max_ded_d); c3_z cun_z = 0; - for ( c3_w i_w = 0; i_w < cun_w; i_w++ ) { - c3_w src_w = dun_w + i_w; - c3_w idx_w = i_w * 3; - c3_y* buf_y 
= (c3_y*)byt_p[src_w]; - - iov_u[idx_w + 0].iov_base = &siz_i[src_w]; - iov_u[idx_w + 0].iov_len = sizeof(c3_d); - iov_u[idx_w + 1].iov_base = buf_y; - iov_u[idx_w + 1].iov_len = siz_i[src_w]; - iov_u[idx_w + 2].iov_base = &siz_i[src_w]; - iov_u[idx_w + 2].iov_len = sizeof(c3_d); - - cun_z += sizeof(c3_d) + siz_i[src_w] + sizeof(c3_d); + for ( c3_d i_d = 0; i_d < cun_d; i_d++ ) { + c3_d src_d = dun_d + i_d; + c3_d idx_d = i_d * 3; + c3_y* buf_y = (c3_y*)byt_p[src_d]; + + iov_u[idx_d + 0].iov_base = &siz_i[src_d]; + iov_u[idx_d + 0].iov_len = sizeof(c3_d); + iov_u[idx_d + 1].iov_base = buf_y; + iov_u[idx_d + 1].iov_len = siz_i[src_d]; + iov_u[idx_d + 2].iov_base = &siz_i[src_d]; + iov_u[idx_d + 2].iov_len = sizeof(c3_d); + + cun_z += sizeof(c3_d) + siz_i[src_d] + sizeof(c3_d); } - c3_zs ret_zs = pwritev(txt_u->fid_i, iov_u, cun_w * 3, now_d); + c3_zs ret_zs = pwritev(txt_u->fid_i, iov_u, cun_d * 3, now_d); if ( ret_zs != (c3_zs)cun_z ) { fprintf(stderr, "book: batch write failed: wrote %zd of %zu bytes: %s\r\n", @@ -910,7 +902,7 @@ u3_book_save(u3_book* txt_u, } now_d += cun_z; - dun_w += cun_w; + dun_d += cun_d; } c3_d new_las_d = eve_d + len_d - 1; diff --git a/pkg/vere/db/book.h b/pkg/vere/db/book.h index c4284a4704..5f91fa00ad 100644 --- a/pkg/vere/db/book.h +++ b/pkg/vere/db/book.h @@ -56,7 +56,7 @@ u3_book_head hed_u; // cached header (current valid state) c3_d las_d; // cached last event number c3_d off_d; // cached append offset (end of last event) - c3_w act_w; // active header slot (0 or 1) + c3_w act_w; // active header slot a or b (0 or 1) } u3_book; /* u3_book_walk: event iterator From 71c086b58ff9ad78d69194c752924a357dde8f54 Mon Sep 17 00:00:00 2001 From: Matthew LeVan Date: Fri, 13 Feb 2026 08:09:51 -0500 Subject: [PATCH 38/38] book: adds last batch checksum with validation --- pkg/vere/book_tests.c | 133 ++++++++++++++++++++++++++++++++- pkg/vere/db/book.c | 169 +++++++++++++++++++++++++++++++++++++++++- pkg/vere/db/book.h | 20 +++-- 3 files changed, 
312 insertions(+), 10 deletions(-) diff --git a/pkg/vere/book_tests.c b/pkg/vere/book_tests.c index daed5b0c8a..741660211b 100644 --- a/pkg/vere/book_tests.c +++ b/pkg/vere/book_tests.c @@ -1241,6 +1241,136 @@ _test_metadata_size_validation(void) return ret_i; } +/* _test_partial_batch_recovery(): simulate power failure where header is +** flushed but deed data is corrupt. +** +** writes two batches: event 1 (batch 1), events 2-3 (batch 2). +** corrupts deed 3's buffer data while keeping its len_d/let_d framing +** intact. on reopen, the batch 2 checksum should fail and recovery +** should roll back batch 2, leaving only event 1. +*/ +static c3_i +_test_partial_batch_recovery(void) +{ + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + c3_y* ev1_y = 0; + c3_y* ev2_y = 0; + c3_y* ev3_y = 0; + c3_z siz_z; + c3_c pax_c[8192]; + + if ( !txt_u ) { + fprintf(stderr, " partial_batch: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // batch 1: write event 1 + ev1_y = _test_make_event(&siz_z, 1); + { + void* byt_p[1] = { ev1_y }; + c3_z siz_i[1] = { siz_z }; + + if ( c3n == u3_book_save(txt_u, 1, 1, byt_p, siz_i, 0) ) { + fprintf(stderr, " partial_batch: save batch 1 failed\r\n"); + ret_i = 0; + goto cleanup; + } + } + + // batch 2: write events 2-3 + ev2_y = _test_make_event(&siz_z, 2); + ev3_y = _test_make_event(&siz_z, 3); + { + void* byt_p[2] = { ev2_y, ev3_y }; + c3_z siz_i[2] = { siz_z, siz_z }; + + if ( c3n == u3_book_save(txt_u, 2, 2, byt_p, siz_i, 0) ) { + fprintf(stderr, " partial_batch: save batch 2 failed\r\n"); + ret_i = 0; + goto cleanup; + } + } + + u3_book_exit(txt_u); + txt_u = 0; + + // corrupt deed 3's buffer data while keeping framing intact + // + // each deed: len_d (8) + buffer (siz_z) + let_d (8) + // + snprintf(pax_c, sizeof(pax_c), "%s/book.log", dir_c); + { + c3_d ded_d = 8 + siz_z + 8; + c3_d dee_d = BOOK_DEED_BASE + (ded_d * 2); + c3_d buf_d = dee_d + 8; + + c3_y 
jnk_y[64]; + memset(jnk_y, 0xFF, sizeof(jnk_y)); + if ( c3n == _test_write_raw(pax_c, buf_d, jnk_y, siz_z) ) { + fprintf(stderr, " partial_batch: corrupt failed\r\n"); + ret_i = 0; + goto cleanup; + } + } + + // reopen — batch 2 checksum should fail, recovery rolls back batch 2 + txt_u = u3_book_init(dir_c); + if ( !txt_u ) { + fprintf(stderr, " partial_batch: reopen failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // verify only event 1 remains (batch 2 rolled back) + { + c3_d low_d, hig_d; + u3_book_gulf(txt_u, &low_d, &hig_d); + + if ( hig_d != 1 ) { + fprintf(stderr, " partial_batch: expected hig=1, got %" PRIu64 "\r\n", hig_d); + ret_i = 0; + } + } + + // verify event 1 is still readable and correct + { + _test_read_ctx ctx_u = {0}; + + if ( c3n == u3_book_read(txt_u, &ctx_u, 1, 1, _test_read_cb) ) { + fprintf(stderr, " partial_batch: read event 1 failed\r\n"); + ret_i = 0; + } + else { + if ( ctx_u.len_z != siz_z || + 0 != memcmp(ctx_u.buf_y, ev1_y, siz_z) ) + { + fprintf(stderr, " partial_batch: event 1 data mismatch\r\n"); + ret_i = 0; + } + _free(ctx_u.buf_y); + } + } + + u3_book_exit(txt_u); + txt_u = 0; + +cleanup: + if ( txt_u ) u3_book_exit(txt_u); + if ( ev1_y ) _free(ev1_y); + if ( ev2_y ) _free(ev2_y); + if ( ev3_y ) _free(ev3_y); + _test_rm_rf(dir_c); + _free(dir_c); + + fprintf(stderr, " partial_batch_recovery: %s\r\n", ret_i ? 
"ok" : "FAILED"); + return ret_i; +} + //============================================================================== // Benchmarks //============================================================================== @@ -1627,6 +1757,7 @@ main(int argc, char* argv[]) // crash recovery tests ret_i &= _test_truncated_file_recovery(); + ret_i &= _test_partial_batch_recovery(); // iterator tests ret_i &= _test_walk_single_event(); @@ -1646,7 +1777,7 @@ main(int argc, char* argv[]) // benchmarks // ret_i &= _bench_write_speed(1000, 128); // ret_i &= _bench_write_speed_batched(100000, 1280, 1000); - ret_i &= _bench_write_speed_mixed(10000, 128); + // ret_i &= _bench_write_speed_mixed(10000, 128); fprintf(stderr, "\r\n"); if ( ret_i ) { diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c index 14403fccf0..c1055ec61f 100644 --- a/pkg/vere/db/book.c +++ b/pkg/vere/db/book.c @@ -486,8 +486,62 @@ _book_scan_back(u3_book* txt_u, c3_d* off_d) return c3n; } - // deed is valid — use header's las_d as authoritative + // deed is valid — verify batch checksum before accepting c3_free(red_u.buf_y); + + if ( txt_u->hed_u.bat_w > 0 ) { + // walk backward through bat_w deeds to find batch start + c3_d cur_d = pos_d; + + for ( c3_d i_d = 0; i_d < txt_u->hed_u.bat_w; i_d++ ) { + if ( cur_d < BOOK_DEED_BASE + sizeof(c3_d) ) { + *off_d = BOOK_DEED_BASE; + return c3n; + } + + c3_d tet_d; + ret_zs = pread(txt_u->fid_i, &tet_d, sizeof(c3_d), + cur_d - sizeof(c3_d)); + if ( ret_zs != sizeof(c3_d) ) { + *off_d = BOOK_DEED_BASE; + return c3n; + } + + c3_d ded_d = _book_deed_size(tet_d); + if ( ded_d > cur_d - BOOK_DEED_BASE ) { + *off_d = BOOK_DEED_BASE; + return c3n; + } + + cur_d -= ded_d; + } + + // read the batch region and verify checksum + c3_d byt_d = pos_d - cur_d; + c3_y* bat_y = c3_malloc(byt_d); + + if ( !bat_y ) { + *off_d = BOOK_DEED_BASE; + return c3n; + } + + ret_zs = pread(txt_u->fid_i, bat_y, byt_d, cur_d); + if ( ret_zs != (c3_zs)byt_d ) { + c3_free(bat_y); + *off_d = 
BOOK_DEED_BASE; + return c3n; + } + + c3_w sum_w = (c3_w)crc32(0, bat_y, byt_d); + c3_free(bat_y); + + if ( sum_w != txt_u->hed_u.sum_w ) { + fprintf(stderr, "book: batch checksum mismatch\r\n"); + *off_d = BOOK_DEED_BASE; + return c3n; + } + } + *off_d = pos_d; txt_u->las_d = txt_u->hed_u.las_d; return c3y; @@ -578,6 +632,8 @@ _book_scan_fore(u3_book* txt_u, c3_d* off_d) // update header to match recovered state txt_u->hed_u.las_d = las_d; + txt_u->hed_u.sum_w = 0; + txt_u->hed_u.bat_w = 0; _book_save_head(txt_u); } else { txt_u->las_d = las_d; @@ -587,6 +643,106 @@ _book_scan_fore(u3_book* txt_u, c3_d* off_d) return c3y; } +/* _book_check_batch(): verify batch integrity and roll back if corrupt. +** +** verifies that the latest batch of deeds matches the checksum +** stored in the header. if the checksum fails, truncates the +** file to remove the corrupt batch and updates the header. +** +** this protects against power failure where the header is flushed +** to disk but deed data is only partially written. 
+*/ +static void +_book_check_batch(u3_book* txt_u) +{ + if ( 0 == txt_u->hed_u.bat_w ) { + return; + } + + // walk backward through bat_w deeds to find batch start + c3_d cur_d = txt_u->off_d; + + for ( c3_d i_d = 0; i_d < txt_u->hed_u.bat_w; i_d++ ) { + if ( cur_d < BOOK_DEED_BASE + sizeof(c3_d) ) { + return; + } + + c3_d tet_d; + c3_zs ret_zs = pread(txt_u->fid_i, &tet_d, sizeof(c3_d), + cur_d - sizeof(c3_d)); + if ( ret_zs != sizeof(c3_d) ) { + return; + } + + c3_d ded_d = _book_deed_size(tet_d); + if ( ded_d > cur_d - BOOK_DEED_BASE ) { + return; + } + + cur_d -= ded_d; + } + + // read the batch region and verify checksum + c3_d byt_d = txt_u->off_d - cur_d; + c3_y* bat_y = c3_malloc(byt_d); + + if ( !bat_y ) { + return; + } + + c3_zs ret_zs = pread(txt_u->fid_i, bat_y, byt_d, cur_d); + if ( ret_zs != (c3_zs)byt_d ) { + c3_free(bat_y); + return; + } + + c3_w sum_w = (c3_w)crc32(0, bat_y, byt_d); + c3_free(bat_y); + + if ( sum_w == txt_u->hed_u.sum_w ) { + return; // checksum valid + } + + // batch is corrupt — roll back + u3l_log("book: batch checksum mismatch, rolling back\r\n"); + + // count valid events before the corrupt batch + c3_d pre_d = 0; + c3_d pos_d = BOOK_DEED_BASE; + + while ( pos_d < cur_d ) { + c3_d len_d; + ret_zs = pread(txt_u->fid_i, &len_d, sizeof(c3_d), pos_d); + if ( ret_zs != sizeof(c3_d) || 0 == len_d ) { + break; + } + + c3_d siz_d = _book_deed_size(len_d); + if ( pos_d + siz_d > cur_d ) { + break; + } + + pos_d += siz_d; + pre_d++; + } + + c3_d las_d = ( pre_d > 0 ) + ? txt_u->hed_u.fir_d + pre_d + : txt_u->hed_u.fir_d; + + // truncate and update state + if ( -1 != ftruncate(txt_u->fid_i, cur_d) ) { + c3_sync(txt_u->fid_i); + } + + txt_u->off_d = cur_d; + txt_u->las_d = las_d; + txt_u->hed_u.las_d = las_d; + txt_u->hed_u.sum_w = 0; + txt_u->hed_u.bat_w = 0; + _book_save_head(txt_u); +} + /* _book_pull_epoc(): parse epoch number from directory path. ** ** expects path ending in "0iN" where N is the epoch number. 
@@ -689,6 +845,10 @@ u3_book_init(const c3_c* pax_c) _book_scan_fore(txt_u, &txt_u->off_d); } + // verify latest batch integrity (catches content corruption + // that structural checks miss, e.g. header flushed but deeds not) + _book_check_batch(txt_u); + // fir_d pre-initialized but no events found: set las_d to match if ( txt_u->hed_u.fir_d && !txt_u->las_d ) { txt_u->las_d = txt_u->hed_u.fir_d; @@ -873,6 +1033,7 @@ u3_book_save(u3_book* txt_u, struct iovec iov_u[max_ded_d * 3]; c3_d now_d = txt_u->off_d; c3_d dun_d = 0; + c3_w chk_w = (c3_w)crc32(0, Z_NULL, 0); while ( dun_d < len_d ) { c3_d cun_d = c3_min(len_d - dun_d, max_ded_d); @@ -890,6 +1051,10 @@ u3_book_save(u3_book* txt_u, iov_u[idx_d + 2].iov_base = &siz_i[src_d]; iov_u[idx_d + 2].iov_len = sizeof(c3_d); + chk_w = (c3_w)crc32(chk_w, (const c3_y*)&siz_i[src_d], sizeof(c3_d)); + chk_w = (c3_w)crc32(chk_w, buf_y, siz_i[src_d]); + chk_w = (c3_w)crc32(chk_w, (const c3_y*)&siz_i[src_d], sizeof(c3_d)); + cun_z += sizeof(c3_d) + siz_i[src_d] + sizeof(c3_d); } @@ -907,6 +1072,8 @@ u3_book_save(u3_book* txt_u, c3_d new_las_d = eve_d + len_d - 1; txt_u->hed_u.las_d = new_las_d; + txt_u->hed_u.sum_w = chk_w; + txt_u->hed_u.bat_w = len_d; // commit header: write to inactive slot, fsync, swap active if ( c3n == _book_save_head(txt_u) ) { diff --git a/pkg/vere/db/book.h b/pkg/vere/db/book.h index 5f91fa00ad..6c6b904e12 100644 --- a/pkg/vere/db/book.h +++ b/pkg/vere/db/book.h @@ -7,17 +7,19 @@ /* book: mostly append-only event log ** - ** uses double-buffered headers for single-fsync commits (like LMDB). - ** two header slots alternate; the one with higher valid seq_d is current. 
+ ** uses double-buffered headers for single-fsync commits (like LMDB) + ** two header slots alternate; the one with higher valid seq_d is current */ - /* u3_book_head: on-disk file header (32 bytes, page-aligned slots) + /* u3_book_head: on-disk file header (48 bytes, page-aligned slots) ** - ** fir_d is write-once (set on first event save). - ** las_d is updated after each batch of events is committed. - ** seq_d is monotonically increasing; determines which slot is current. - ** crc_w is CRC32 of preceding fields to detect partial writes. + ** fir_d is write-once (set on first event save) + ** las_d is updated after each batch of events is committed + ** seq_d is monotonically increasing; determines which slot is current + ** bat_w is the number of deeds in the latest batch written + ** sum_w is CRC32 of the latest batch of deeds (for integrity check) + ** crc_w is CRC32 of preceding fields to detect partial writes ** - ** two header slots at offsets 0 and 4096; deeds start at 8192. + ** two header slots at offsets 0 and 4096; deeds start at 8192 */ typedef struct _u3_book_head { c3_w mag_w; // magic number: 0x424f4f4b ("BOOK") @@ -25,6 +27,8 @@ c3_d fir_d; // first event number in file c3_d las_d; // last event number (commit marker) c3_d seq_d; // sequence number (for double-buffer) + c3_d bat_w; // number of deeds in latest batch + c3_w sum_w; // CRC32 of latest deed batch data c3_w crc_w; // CRC32 checksum (of preceding fields) } u3_book_head;