diff --git a/.github/workflows/shared.yml b/.github/workflows/shared.yml
index b0b8b6342b..8d7e757e5b 100644
--- a/.github/workflows/shared.yml
+++ b/.github/workflows/shared.yml
@@ -100,6 +100,7 @@ jobs:
             pact-test equality-test \
             boot-test newt-test \
             vere-noun-test unix-test \
+            book-test \
             benchmarks \
             -Doptimize=ReleaseFast \
             -Dpace=${{inputs.pace}} \
diff --git a/build.zig b/build.zig
index d48fc0e168..1c20c2c898 100644
--- a/build.zig
+++ b/build.zig
@@ -719,6 +719,16 @@ fn buildBinary(
             .file = "pkg/vere/ames_tests.c",
             .deps = vere_test_deps,
         },
+        .{
+            .name = "book-test",
+            .file = "pkg/vere/book_tests.c",
+            .deps = vere_test_deps,
+        },
+        .{
+            .name = "lmdb-test",
+            .file = "pkg/vere/lmdb_tests.c",
+            .deps = vere_test_deps,
+        },
         .{
             .name = "boot-test",
             .file = "pkg/vere/boot_tests.c",
diff --git a/pkg/c3/platform/windows/compat.c b/pkg/c3/platform/windows/compat.c
index d47f3f8f28..795d09d3e1 100644
--- a/pkg/c3/platform/windows/compat.c
+++ b/pkg/c3/platform/windows/compat.c
@@ -5,6 +5,18 @@
 #include
 #include "errno.h"
 
+// ov_from_off_t(): load a file offset into an OVERLAPPED, split into its
+// low and high DWORD halves (the high half is 0 when off_t fits a DWORD).
+static void
+ov_from_off_t(OVERLAPPED* ov, off_t offset)
+{
+  ov->OffsetHigh = (sizeof(off_t) <= sizeof(DWORD)) ?
+    (DWORD)0 : (DWORD)((offset >> 32) & 0xFFFFFFFFL);
+
+  ov->Offset = (sizeof(off_t) <= sizeof(DWORD)) ?
+    (DWORD)offset : (DWORD)(offset & 0xFFFFFFFFL);
+}
+
 // set default CRT file mode to binary
 // note that mingw binmode.o does nothing
 #undef _fmode
@@ -560,10 +572,7 @@ ssize_t pread(int fd, void *buf, size_t count, off_t offset)

   OVERLAPPED overlapped = {0};

-  overlapped.OffsetHigh = (sizeof(off_t) <= sizeof(DWORD)) ?
-    (DWORD)0 : (DWORD)((offset >> 32) & 0xFFFFFFFFL);
-  overlapped.Offset = (sizeof(off_t) <= sizeof(DWORD)) ?
-    (DWORD)offset : (DWORD)(offset & 0xFFFFFFFFL);
+  ov_from_off_t(&overlapped, offset);

   HANDLE h = (HANDLE)_get_osfhandle(fd);

@@ -590,10 +599,7 @@ ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset)

   OVERLAPPED overlapped = {0};

-  overlapped.OffsetHigh = (sizeof(off_t) <= sizeof(DWORD)) ?
-    (DWORD)0 : (DWORD)((offset >> 32) & 0xFFFFFFFFL);
-  overlapped.Offset = (sizeof(off_t) <= sizeof(DWORD)) ?
-    (DWORD)offset : (DWORD)(offset & 0xFFFFFFFFL);
+  ov_from_off_t(&overlapped, offset);

   HANDLE h = (HANDLE)_get_osfhandle(fd);

@@ -609,3 +615,49 @@ ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset)

   return (ssize_t)len;
 }
+
+/* pwritev(): positional gather-write emulated with WriteFile().
+**
+**   writes the [iovcnt] buffers of [iov] in order, starting at
+**   [offset], and stops at the first short write.
+**
+**   returns: total bytes written, or -1 with errno set when
+**   nothing was written.
+*/
+ssize_t pwritev(int fd, const struct iovec* iov, size_t iovcnt, off_t offset)
+{
+  HANDLE h = (HANDLE)_get_osfhandle(fd);
+
+  if ( INVALID_HANDLE_VALUE == h ) {
+    errno = EBADF;
+    return -1;
+  }
+
+  DWORD written;
+  ssize_t len = 0;
+  OVERLAPPED ov = {0};
+
+  for (size_t i = 0; i < iovcnt; i++) {
+    // WriteFile() takes a DWORD count: clamp oversized buffers instead of
+    // wrapping modulo 2^32; the caller then observes a short write
+    DWORD len_write = ( iov[i].iov_len > (size_t)MAXDWORD )
+                    ? MAXDWORD : (DWORD)iov[i].iov_len;
+
+    ov_from_off_t(&ov, offset);
+
+    if ( !WriteFile(h, iov[i].iov_base, len_write, &written, &ov) ) {
+      // like POSIX, report progress already made; the error
+      // resurfaces when the caller retries the remainder
+      if ( len > 0 ) break;
+      errno = err_win_to_posix(GetLastError());
+      return -1;
+    }
+
+    len += written;
+    offset += written;
+
+    if ( written < iov[i].iov_len ) break;
+  }
+
+  return len;
+}
diff --git a/pkg/c3/platform/windows/compat.h b/pkg/c3/platform/windows/compat.h
index 17338b8e16..94d31063a3 100644
--- a/pkg/c3/platform/windows/compat.h
+++ b/pkg/c3/platform/windows/compat.h
@@ -15,8 +15,11 @@ void *memmem(const void *h0, size_t k, const void *n0, size_t l);

 uint32_t getppid();

+struct iovec { void *iov_base; size_t iov_len; };
+
 ssize_t pread(int fd, void *buf, size_t count, off_t offset);
 ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset);
+ssize_t pwritev(int fd, const struct iovec* iov, size_t iovcnt, off_t offset);

 #define SIGUSR1 10
 #define SIGALRM 14
diff --git a/pkg/noun/version.h b/pkg/noun/version.h
index 88c6ac400a..b19f7289a7 100644
--- a/pkg/noun/version.h
+++ b/pkg/noun/version.h
@@ -38,6 +38,7 @@ typedef c3_w u3e_version;
 */
 #define U3E_VER1 1 // north+south.bin
 #define U3E_VER2 2 // image.bin
-#define U3E_VERLAT U3E_VER2
+#define U3E_VER3 3 // book.log
+#define U3E_VERLAT U3E_VER3

 #endif /* ifndef
U3_VERSION_H */
diff --git a/pkg/vere/book_tests.c b/pkg/vere/book_tests.c
new file mode 100644
index 0000000000..741660211b
--- /dev/null
+++ b/pkg/vere/book_tests.c
@@ -0,0 +1,1791 @@
+#include "db/book.h"
+
+// NOTE(review): the system header names were missing from this copy of the
+// patch; the list below is reconstructed from the calls made in this file.
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <unistd.h>
+
+#define _alloc(sz) malloc(sz)
+#define _free(ptr) free(ptr)
+
+// book format v2: header area size (two 4096-byte header slots)
+#define BOOK_DEED_BASE 8192
+
+/* _test_make_tmpdir(): create unique temporary directory with epoch subdir.
+**
+** creates /tmp/book_test_XXXXXX/0i0 and returns the epoch path.
+** on any failure nothing is left on disk and 0 is returned.
+**
+** returns: heap-allocated path (caller must free), or 0 on failure
+*/
+static c3_c*
+_test_make_tmpdir(void)
+{
+  c3_c pat_c[] = "/tmp/book_test_XXXXXX";
+  c3_c* dir_c = mkdtemp(pat_c);
+
+  if ( !dir_c ) {
+    fprintf(stderr, "book_test: mkdtemp failed: %s\r\n", strerror(errno));
+    return 0;
+  }
+
+  // create epoch subdirectory 0i0
+  c3_c epo_c[256];
+  snprintf(epo_c, sizeof(epo_c), "%s/0i0", dir_c);
+  if ( -1 == mkdir(epo_c, 0755) ) {
+    fprintf(stderr, "book_test: mkdir failed: %s\r\n", strerror(errno));
+    rmdir(dir_c);  // don't leave the empty tmpdir behind
+    return 0;
+  }
+
+  c3_c* ret_c = _alloc(strlen(epo_c) + 1);
+  if ( !ret_c ) {
+    fprintf(stderr, "book_test: alloc failed\r\n");
+    rmdir(epo_c);
+    rmdir(dir_c);
+    return 0;
+  }
+  strcpy(ret_c, epo_c);
+  return ret_c;
+}
+
+/* _test_rm_rf(): recursively remove a test directory.
+**
+** expects epoch path like /tmp/book_test_XXXXXX/0i0
+** removes the parent directory (the whole test dir).
+** exits the process if the path is not under /tmp/book_test_*,
+** so a bad argument can never turn into `rm -rf /tmp`.
+*/
+static void
+_test_rm_rf(const c3_c* pax_c)
+{
+  static const c3_c pre_c[] = "/tmp/book_test_";
+  const c3_z pre_z = sizeof(pre_c) - 1;
+
+  if ( !pax_c || strncmp(pax_c, pre_c, pre_z) != 0 ) {
+    // never hand a null pointer to %s
+    fprintf(stderr, "book_test: refusing to remove non-/tmp path: %s\r\n",
+                    pax_c ? pax_c : "(null)");
+    exit(1);
+  }
+
+  c3_c* par_c = strdup(pax_c);
+  if ( !par_c ) {
+    fprintf(stderr, "book_test: alloc failed\r\n");
+    exit(1);
+  }
+
+  // strip epoch suffix to get parent tmpdir, but only when the
+  // remainder still carries the full test prefix
+  c3_c* las_c = strrchr(par_c, '/');
+  if ( las_c && (c3_z)(las_c - par_c) >= pre_z ) *las_c = '\0';
+
+  c3_c cmd_c[8192];
+  snprintf(cmd_c, sizeof(cmd_c), "rm -rf '%s'", par_c);
+  if ( 0 != system(cmd_c) ) {
+    fprintf(stderr, "book_test: failed to remove %s\r\n", par_c);
+  }
+  free(par_c);
+}
+
+/* _test_make_event(): create a test event buffer (mug + jam).
+**
+** creates a buffer with 4-byte mug followed by jam data.
+** jam data is just the event number repeated.
+**
+** returns: heap-allocated buffer (caller must free);
+**          exits the process if the allocation fails
+*/
+static c3_y*
+_test_make_event(c3_z* len_z, c3_d eve_d)
+{
+  // create simple jam data: 8 bytes containing the event number
+  c3_z jam_z = 8;
+  c3_z tot_z = 4 + jam_z; // mug + jam
+  c3_y* buf_y = _alloc(tot_z);
+
+  // callers pass the buffer on without checking it: fail loudly here
+  if ( !buf_y ) {
+    fprintf(stderr, "book_test: alloc failed\r\n");
+    exit(1);
+  }
+
+  // mug: use event number as simple hash
+  c3_w mug_w = (c3_w)(eve_d * 0x12345678);
+  memcpy(buf_y, &mug_w, 4);
+
+  // jam: event number as 8 bytes
+  memcpy(buf_y + 4, &eve_d, 8);
+
+  *len_z = tot_z;
+  return buf_y;
+}
+
+/* _test_truncate_file(): truncate file to given size.
+**
+** returns: c3y on success, c3n if truncate() fails
+*/
+static c3_o
+_test_truncate_file(const c3_c* pax_c, c3_d siz_d)
+{
+  if ( -1 == truncate(pax_c, siz_d) ) {
+    return c3n;
+  }
+  return c3y;
+}
+
+/* _test_write_raw(): write raw bytes at offset in file.
+**
+** returns: c3y only if all [len_z] bytes were written
+*/
+static c3_o
+_test_write_raw(const c3_c* pax_c, c3_d off_d, void* dat_v, c3_z len_z)
+{
+  c3_i fid_i = open(pax_c, O_RDWR);
+  if ( fid_i < 0 ) {
+    return c3n;
+  }
+
+  c3_zs ret = pwrite(fid_i, dat_v, len_z, off_d);
+  close(fid_i);
+
+  return (ret == (c3_zs)len_z) ? c3y : c3n;
+}
+
+/* _test_file_size(): get file size.
+**
+** returns: size in bytes, or 0 if stat() fails
+*/
+static c3_d
+_test_file_size(const c3_c* pax_c)
+{
+  struct stat buf_u;
+  if ( -1 == stat(pax_c, &buf_u) ) {
+    return 0;
+  }
+  return (c3_d)buf_u.st_size;
+}
+
+/* _test_read_cb(): callback for u3_book_read that stores event data.
+**
+** copies the event into a fresh heap buffer in the context
+** (the test frees it). returns c3n if the copy cannot be allocated.
+*/
+typedef struct {
+  c3_d eve_d;
+  c3_z len_z;
+  c3_y* buf_y;
+  c3_o called;
+} _test_read_ctx;
+
+static c3_o
+_test_read_cb(void* ptr_v, c3_d eve_d, c3_z len_z, void* buf_v)
+{
+  _test_read_ctx* ctx_u = ptr_v;
+  ctx_u->eve_d = eve_d;
+  ctx_u->len_z = len_z;
+  ctx_u->buf_y = _alloc(len_z);
+  ctx_u->called = c3y;
+
+  // malloc(0) may legitimately return null; only a failed
+  // non-empty allocation is an error
+  if ( !ctx_u->buf_y ) {
+    return ( 0 == len_z ) ? c3y : c3n;
+  }
+
+  memcpy(ctx_u->buf_y, buf_v, len_z);
+  return c3y;
+}
+
+/* _test_meta_cb(): callback for u3_book_read_meta.
+**
+** records the reported size and copies at most sizeof(buf_y)
+** bytes of the value into the context.
+*/
+typedef struct {
+  c3_zs siz_zs;
+  c3_y buf_y[256];
+} _test_meta_ctx;
+
+static void
+_test_meta_cb(void* ptr_v, c3_zs siz_zs, void* dat_v)
+{
+  _test_meta_ctx* ctx_u = ptr_v;
+  ctx_u->siz_zs = siz_zs;
+
+  if ( siz_zs > 0 && dat_v ) {
+    // never copy past the fixed-size context buffer
+    c3_z len_z = (c3_z)siz_zs;
+    if ( len_z > sizeof(ctx_u->buf_y) ) {
+      len_z = sizeof(ctx_u->buf_y);
+    }
+    memcpy(ctx_u->buf_y, dat_v, len_z);
+  }
+}
+
+//==============================================================================
+// Boundary Condition Tests
+//==============================================================================
+
+/* _test_empty_log_operations(): test operations on empty log.
+**
+** a fresh log must report gulf (0,0) and refuse both read and
+** walk_init for event 1.
+**
+** returns: 1 if all checks pass, 0 otherwise
+*/
+static c3_i
+_test_empty_log_operations(void)
+{
+  c3_c* dir_c = _test_make_tmpdir();
+  if ( !dir_c ) return 0;
+
+  c3_i ret_i = 1;
+  u3_book* txt_u = u3_book_init(dir_c);
+
+  if ( !txt_u ) {
+    fprintf(stderr, " empty_log: init failed\r\n");
+    ret_i = 0;
+    goto cleanup;
+  }
+
+  // test gulf on empty log
+  {
+    // initialized so a failing gulf cannot leave them indeterminate
+    c3_d low_d = 0, hig_d = 0;
+    c3_o gul_o = u3_book_gulf(txt_u, &low_d, &hig_d);
+
+    if ( c3y != gul_o ) {
+      fprintf(stderr, " empty_log: gulf returned c3n\r\n");
+      ret_i = 0;
+    }
+    // empty log should have fir_d=0, las_d=0
+    if ( 0 != low_d || 0 != hig_d ) {
+      fprintf(stderr, " empty_log: gulf expected (0,0), got (%" PRIu64 ",%" PRIu64 ")\r\n",
+                      low_d, hig_d);
+      ret_i = 0;
+    }
+  }
+
+  // test read on empty log - should fail
+  {
+    _test_read_ctx ctx_u = {0};
+    c3_o red_o = u3_book_read(txt_u, &ctx_u, 1, 1, _test_read_cb);
+
+    if ( c3n != red_o ) {
+      fprintf(stderr, " empty_log: read should fail on empty log\r\n");
+      ret_i = 0;
+    }
+    // free whatever an unexpected callback may have stored
+    _free(ctx_u.buf_y);
+  }
+
+  // test walk_init on empty log - should fail
+  {
+    u3_book_walk itr_u;
+    c3_o wlk_o = u3_book_walk_init(txt_u, &itr_u, 1, 1);
+
+    if ( c3n != wlk_o ) {
+      fprintf(stderr, " empty_log: walk_init should fail on empty log\r\n");
+      ret_i = 0;
+      // release the walker that should not have been created
+      u3_book_walk_done(&itr_u);
+    }
+  }
+
+  u3_book_exit(txt_u);
+
+cleanup:
+  _test_rm_rf(dir_c);
+  _free(dir_c);
+
+  fprintf(stderr, " empty_log_operations: %s\r\n", ret_i ? "ok" : "FAILED");
+  return ret_i;
+}
+
+/* _test_single_event_lifecycle(): write, read, walk single event.
+**
+** returns: 1 if all checks pass, 0 otherwise
+*/
+static c3_i
+_test_single_event_lifecycle(void)
+{
+  c3_c* dir_c = _test_make_tmpdir();
+  if ( !dir_c ) return 0;
+
+  c3_i ret_i = 1;
+  u3_book* txt_u = u3_book_init(dir_c);
+  c3_y* evt_y = 0;
+  c3_z evt_z;
+
+  if ( !txt_u ) {
+    fprintf(stderr, " single_event: init failed\r\n");
+    ret_i = 0;
+    goto cleanup;
+  }
+
+  // write single event (event #1, epoch 0)
+  evt_y = _test_make_event(&evt_z, 1);
+  {
+    void* byt_p[1] = { evt_y };
+    c3_z siz_i[1] = { evt_z };
+
+    c3_o sav_o = u3_book_save(txt_u, 1, 1, byt_p, siz_i, 0);
+    if ( c3n == sav_o ) {
+      fprintf(stderr, " single_event: save failed\r\n");
+      ret_i = 0;
+      goto cleanup;
+    }
+  }
+
+  // verify gulf
+  // NB: fir_d is the epoch base (0), las_d is the last stored event (1)
+  {
+    c3_d low_d = 0, hig_d = 0;
+    u3_book_gulf(txt_u, &low_d, &hig_d);
+
+    if ( 0 != low_d || 1 != hig_d ) {
+      fprintf(stderr, " single_event: gulf expected (0,1), got (%" PRIu64 ",%" PRIu64 ")\r\n",
+                      low_d, hig_d);
+      ret_i = 0;
+    }
+  }
+
+  // read it back
+  {
+    _test_read_ctx ctx_u = {0};
+    c3_o red_o = u3_book_read(txt_u, &ctx_u, 1, 1, _test_read_cb);
+
+    if ( c3n == red_o ) {
+      fprintf(stderr, " single_event: read failed\r\n");
+      ret_i = 0;
+    }
+    else {
+      if ( ctx_u.eve_d != 1 ) {
+        fprintf(stderr, " single_event: read wrong event number\r\n");
+        ret_i = 0;
+      }
+      if ( ctx_u.len_z != evt_z ) {
+        fprintf(stderr, " single_event: read wrong length\r\n");
+        ret_i = 0;
+      }
+      if ( 0 != memcmp(ctx_u.buf_y, evt_y, evt_z) ) {
+        fprintf(stderr, " single_event: read data mismatch\r\n");
+        ret_i = 0;
+      }
+      _free(ctx_u.buf_y);
+    }
+  }
+
+  // walk it
+  {
+    u3_book_walk itr_u;
+    c3_o wlk_o = u3_book_walk_init(txt_u, &itr_u, 1, 1);
+
+    if ( c3n == wlk_o ) {
+      fprintf(stderr, " single_event: walk_init failed\r\n");
+      ret_i = 0;
+    }
+    else {
+      c3_z len_z;
+      void* buf_v;
+      c3_o nex_o = u3_book_walk_next(&itr_u, &len_z, &buf_v);
+
+      if ( c3n == nex_o ) {
+        fprintf(stderr, " single_event: walk_next failed\r\n");
+        ret_i = 0;
+      }
+      else {
+        if ( len_z != evt_z ) {
+          fprintf(stderr, " single_event: walk wrong length\r\n");
+          ret_i = 0;
+        }
+        _free(buf_v);
+
+        // second call should return c3n (end of iteration)
+        nex_o = u3_book_walk_next(&itr_u, &len_z, &buf_v);
+        if ( c3y == nex_o ) {
+          fprintf(stderr, " single_event: walk should end after 1 event\r\n");
+          ret_i = 0;
+          _free(buf_v);
+        }
+      }
+
+      u3_book_walk_done(&itr_u);
+    }
+  }
+
+cleanup:
+  // close here so the early `goto cleanup` paths don't leak the handle
+  if ( txt_u ) u3_book_exit(txt_u);
+  if ( evt_y ) _free(evt_y);
+  _test_rm_rf(dir_c);
+  _free(dir_c);
+
+  fprintf(stderr, " single_event_lifecycle: %s\r\n", ret_i ? "ok" : "FAILED");
+  return ret_i;
+}
+
+/* _test_epoch_boundary_validation(): first event must be epo_d + 1.
+**
+** returns: 1 if all checks pass, 0 otherwise
+*/
+static c3_i
+_test_epoch_boundary_validation(void)
+{
+  c3_c* dir_c = _test_make_tmpdir();
+  if ( !dir_c ) return 0;
+
+  c3_i ret_i = 1;
+  u3_book* txt_u = u3_book_init(dir_c);
+  c3_y* evt_y = 0;
+  c3_z evt_z;
+
+  if ( !txt_u ) {
+    fprintf(stderr, " epoch_boundary: init failed\r\n");
+    ret_i = 0;
+    goto cleanup;
+  }
+
+  evt_y = _test_make_event(&evt_z, 5);
+  void* byt_p[1] = { evt_y };
+  c3_z siz_i[1] = { evt_z };
+
+  // try to write event 5 with epoch 0 - should fail (expects event 1)
+  {
+    c3_o sav_o = u3_book_save(txt_u, 5, 1, byt_p, siz_i, 0);
+    if ( c3y == sav_o ) {
+      fprintf(stderr, " epoch_boundary: should reject event 5 for epoch 0\r\n");
+      ret_i = 0;
+    }
+  }
+
+  // write event 5 with epoch 4 - should succeed (4 + 1 = 5)
+  {
+    c3_o sav_o = u3_book_save(txt_u, 5, 1, byt_p, siz_i, 4);
+    if ( c3n == sav_o ) {
+      fprintf(stderr, " epoch_boundary: should accept event 5 for epoch 4\r\n");
+      ret_i = 0;
+    }
+  }
+
+cleanup:
+  if ( txt_u ) u3_book_exit(txt_u);
+  if ( evt_y ) _free(evt_y);
+  _test_rm_rf(dir_c);
+  _free(dir_c);
+
+  fprintf(stderr, " epoch_boundary_validation: %s\r\n", ret_i ? "ok" : "FAILED");
+  return ret_i;
+}
+
+/* _test_contiguity_gap_rejection(): reject non-contiguous events.
+**
+** saves event 1, then checks that saving event 3 is refused.
+**
+** returns: 1 if all checks pass, 0 otherwise
+*/
+static c3_i
+_test_contiguity_gap_rejection(void)
+{
+  c3_c* dir_c = _test_make_tmpdir();
+  if ( !dir_c ) return 0;
+
+  c3_i ret_i = 1;
+  u3_book* txt_u = u3_book_init(dir_c);
+  c3_y* evt1_y = 0;
+  c3_y* evt3_y = 0;
+  c3_z evt_z;
+
+  if ( !txt_u ) {
+    fprintf(stderr, " contiguity: init failed\r\n");
+    ret_i = 0;
+    goto cleanup;
+  }
+
+  // write event 1
+  evt1_y = _test_make_event(&evt_z, 1);
+  {
+    void* byt_p[1] = { evt1_y };
+    c3_z siz_i[1] = { evt_z };
+
+    c3_o sav_o = u3_book_save(txt_u, 1, 1, byt_p, siz_i, 0);
+    if ( c3n == sav_o ) {
+      fprintf(stderr, " contiguity: save event 1 failed\r\n");
+      ret_i = 0;
+      goto cleanup;
+    }
+  }
+
+  // try to write event 3 (skipping 2) - should fail
+  evt3_y = _test_make_event(&evt_z, 3);
+  {
+    void* byt_p[1] = { evt3_y };
+    c3_z siz_i[1] = { evt_z };
+
+    c3_o sav_o = u3_book_save(txt_u, 3, 1, byt_p, siz_i, 0);
+    if ( c3y == sav_o ) {
+      fprintf(stderr, " contiguity: should reject gap (event 3 after 1)\r\n");
+      ret_i = 0;
+    }
+  }
+
+cleanup:
+  // close here so the early `goto cleanup` path doesn't leak the handle
+  if ( txt_u ) u3_book_exit(txt_u);
+  if ( evt1_y ) _free(evt1_y);
+  if ( evt3_y ) _free(evt3_y);
+  _test_rm_rf(dir_c);
+  _free(dir_c);
+
+  fprintf(stderr, " contiguity_gap_rejection: %s\r\n", ret_i ? "ok" : "FAILED");
+  return ret_i;
+}
+
+/* _test_minimum_event_size(): event with minimum size (just mug).
+**
+** returns: 1 if all checks pass, 0 otherwise
+*/
+static c3_i
+_test_minimum_event_size(void)
+{
+  c3_c* dir_c = _test_make_tmpdir();
+  if ( !dir_c ) return 0;
+
+  c3_i ret_i = 1;
+  u3_book* txt_u = u3_book_init(dir_c);
+
+  if ( !txt_u ) {
+    fprintf(stderr, " min_event: init failed\r\n");
+    ret_i = 0;
+    goto cleanup;
+  }
+
+  // create minimum event: just 4 bytes (mug only, no jam)
+  c3_y evt_y[4] = { 0xDE, 0xAD, 0xBE, 0xEF };
+  void* byt_p[1] = { evt_y };
+  c3_z siz_i[1] = { 4 };
+
+  c3_o sav_o = u3_book_save(txt_u, 1, 1, byt_p, siz_i, 0);
+  if ( c3n == sav_o ) {
+    fprintf(stderr, " min_event: save failed\r\n");
+    ret_i = 0;
+    goto cleanup;
+  }
+
+  // read it back
+  {
+    _test_read_ctx ctx_u = {0};
+    c3_o red_o = u3_book_read(txt_u, &ctx_u, 1, 1, _test_read_cb);
+
+    if ( c3n == red_o ) {
+      fprintf(stderr, " min_event: read failed\r\n");
+      ret_i = 0;
+    }
+    else {
+      if ( ctx_u.len_z != 4 ) {
+        fprintf(stderr, " min_event: wrong length %" PRIu64 "\r\n", (c3_d)ctx_u.len_z);
+        ret_i = 0;
+      }
+      _free(ctx_u.buf_y);
+    }
+  }
+
+cleanup:
+  // close here so the early `goto cleanup` path doesn't leak the handle
+  if ( txt_u ) u3_book_exit(txt_u);
+  _test_rm_rf(dir_c);
+  _free(dir_c);
+
+  fprintf(stderr, " minimum_event_size: %s\r\n", ret_i ? "ok" : "FAILED");
+  return ret_i;
+}
+
+//==============================================================================
+// Crash Recovery & Corruption Tests
+//==============================================================================
+
+/* _test_truncated_file_recovery(): truncate mid-event, verify recovery.
+**
+** write two events, truncate file mid-second-event, reopen.
+** recovery should find only the first complete event.
+**
+** returns: 1 if all checks pass, 0 otherwise
+*/
+static c3_i
+_test_truncated_file_recovery(void)
+{
+  c3_c* dir_c = _test_make_tmpdir();
+  if ( !dir_c ) return 0;
+
+  c3_i ret_i = 1;
+  u3_book* txt_u = u3_book_init(dir_c);
+  c3_y* evt1_y = 0;
+  c3_y* evt2_y = 0;
+  c3_z evt_z;
+  c3_c path_c[8192];
+
+  if ( !txt_u ) {
+    fprintf(stderr, " truncated_file: init failed\r\n");
+    ret_i = 0;
+    goto cleanup;
+  }
+
+  // write two events (each evt_z = 12 bytes: 4 mug + 8 jam)
+  // deed size on disk = 12 (head) + 8 (jam) + 8 (tail) = 28 bytes
+  evt1_y = _test_make_event(&evt_z, 1);
+  evt2_y = _test_make_event(&evt_z, 2);
+  {
+    void* byt_p[2] = { evt1_y, evt2_y };
+    c3_z siz_i[2] = { evt_z, evt_z };
+
+    c3_o sav_o = u3_book_save(txt_u, 1, 2, byt_p, siz_i, 0);
+    if ( c3n == sav_o ) {
+      fprintf(stderr, " truncated_file: save failed\r\n");
+      ret_i = 0;
+      goto cleanup;
+    }
+  }
+
+  u3_book_exit(txt_u);
+  txt_u = 0;
+
+  // file layout: [header A @ 0] [header B @ 4096] [deed1 @ 8192] [deed2]
+  // deed size = sizeof(deed_head) + (len_d - 4) + sizeof(deed_tail)
+  snprintf(path_c, sizeof(path_c), "%s/book.log", dir_c);
+  c3_d siz_d = _test_file_size(path_c);
+
+  // _test_file_size() yields 0 on error; without this guard the
+  // unsigned subtraction below would wrap to a huge deed size
+  if ( siz_d <= BOOK_DEED_BASE ) {
+    fprintf(stderr, " truncated_file: unexpected log size %" PRIu64 "\r\n", siz_d);
+    ret_i = 0;
+    goto cleanup;
+  }
+
+  // calculate deed size dynamically: total - headers = 2 deeds
+  c3_d deed_size = (siz_d - BOOK_DEED_BASE) / 2;
+
+  // truncate to: headers + deed1 + 5 bytes of deed2
+  c3_d truncate_at = BOOK_DEED_BASE + deed_size + 5;
+
+  if ( c3n == _test_truncate_file(path_c, truncate_at) ) {
+    fprintf(stderr, " truncated_file: truncate failed\r\n");
+    ret_i = 0;
+    goto cleanup;
+  }
+
+  // reopen - recovery should find deed1 valid, deed2 truncated
+  txt_u = u3_book_init(dir_c);
+  if ( !txt_u ) {
+    fprintf(stderr, " truncated_file: reopen failed\r\n");
+    ret_i = 0;
+    goto cleanup;
+  }
+
+  // verify only event 1 exists
+  {
+    c3_d low_d = 0, hig_d = 0;
+    u3_book_gulf(txt_u, &low_d, &hig_d);
+
+    if ( hig_d != 1 ) {
+      fprintf(stderr, " truncated_file: expected hig=1, got %" PRIu64 "\r\n", hig_d);
+      ret_i = 0;
+    }
+  }
+
+  u3_book_exit(txt_u);
+  txt_u = 0;
+
+cleanup:
+  if ( txt_u ) u3_book_exit(txt_u);
+  if ( evt1_y ) _free(evt1_y);
+  if ( evt2_y ) _free(evt2_y);
+  _test_rm_rf(dir_c);
+  _free(dir_c);
+
+  fprintf(stderr, " truncated_file_recovery: %s\r\n", ret_i ? "ok" : "FAILED");
+  return ret_i;
+}
+
+
+//==============================================================================
+// Iterator Tests
+//==============================================================================
+
+/* _test_walk_single_event(): walk range of exactly 1 event.
+**
+** saves events 1-3, walks [2,2], expects exactly one event.
+**
+** returns: 1 if all checks pass, 0 otherwise
+*/
+static c3_i
+_test_walk_single_event(void)
+{
+  c3_c* dir_c = _test_make_tmpdir();
+  if ( !dir_c ) return 0;
+
+  c3_i ret_i = 1;
+  u3_book* txt_u = u3_book_init(dir_c);
+  c3_y* evt_y[3] = {0};
+  c3_z evt_z;
+
+  if ( !txt_u ) {
+    fprintf(stderr, " walk_single: init failed\r\n");
+    ret_i = 0;
+    goto cleanup;
+  }
+
+  // write 3 events
+  {
+    void* byt_p[3];
+    c3_z siz_i[3];
+
+    for ( int i = 0; i < 3; i++ ) {
+      evt_y[i] = _test_make_event(&evt_z, i + 1);
+      byt_p[i] = evt_y[i];
+      siz_i[i] = evt_z;
+    }
+
+    c3_o sav_o = u3_book_save(txt_u, 1, 3, byt_p, siz_i, 0);
+    if ( c3n == sav_o ) {
+      fprintf(stderr, " walk_single: save failed\r\n");
+      ret_i = 0;
+      goto cleanup;
+    }
+  }
+
+  // walk just event 2
+  {
+    u3_book_walk itr_u;
+    c3_o wlk_o = u3_book_walk_init(txt_u, &itr_u, 2, 2);
+
+    if ( c3n == wlk_o ) {
+      fprintf(stderr, " walk_single: walk_init failed\r\n");
+      ret_i = 0;
+    }
+    else {
+      c3_z len_z;
+      void* buf_v;
+      c3_i count = 0;
+
+      while ( c3y == u3_book_walk_next(&itr_u, &len_z, &buf_v) ) {
+        count++;
+        _free(buf_v);
+      }
+
+      if ( count != 1 ) {
+        fprintf(stderr, " walk_single: expected 1 event, got %d\r\n", count);
+        ret_i = 0;
+      }
+
+      u3_book_walk_done(&itr_u);
+    }
+  }
+
+cleanup:
+  // close here so the early `goto cleanup` path doesn't leak the handle
+  if ( txt_u ) u3_book_exit(txt_u);
+  for ( int i = 0; i < 3; i++ ) {
+    if ( evt_y[i] ) _free(evt_y[i]);
+  }
+  _test_rm_rf(dir_c);
+  _free(dir_c);
+
+  fprintf(stderr, " walk_single_event: %s\r\n", ret_i ? "ok" : "FAILED");
+  return ret_i;
+}
+
+/* _test_walk_invalidation(): walk_done then walk_next should fail.
+**
+** returns: 1 if all checks pass, 0 otherwise
+*/
+static c3_i
+_test_walk_invalidation(void)
+{
+  c3_c* dir_c = _test_make_tmpdir();
+  if ( !dir_c ) return 0;
+
+  c3_i ret_i = 1;
+  u3_book* txt_u = u3_book_init(dir_c);
+  c3_y* evt_y = 0;
+  c3_z evt_z;
+
+  if ( !txt_u ) {
+    fprintf(stderr, " walk_invalid: init failed\r\n");
+    ret_i = 0;
+    goto cleanup;
+  }
+
+  // write event
+  evt_y = _test_make_event(&evt_z, 1);
+  {
+    void* byt_p[1] = { evt_y };
+    c3_z siz_i[1] = { evt_z };
+
+    c3_o sav_o = u3_book_save(txt_u, 1, 1, byt_p, siz_i, 0);
+    if ( c3n == sav_o ) {
+      fprintf(stderr, " walk_invalid: save failed\r\n");
+      ret_i = 0;
+      goto cleanup;
+    }
+  }
+
+  // init walk, then done, then try next
+  {
+    u3_book_walk itr_u;
+    c3_o wlk_o = u3_book_walk_init(txt_u, &itr_u, 1, 1);
+
+    if ( c3n == wlk_o ) {
+      fprintf(stderr, " walk_invalid: walk_init failed\r\n");
+      ret_i = 0;
+    }
+    else {
+      u3_book_walk_done(&itr_u);
+
+      c3_z len_z;
+      void* buf_v;
+      c3_o nex_o = u3_book_walk_next(&itr_u, &len_z, &buf_v);
+
+      if ( c3y == nex_o ) {
+        fprintf(stderr, " walk_invalid: walk_next should fail after done\r\n");
+        ret_i = 0;
+        _free(buf_v);
+      }
+    }
+  }
+
+cleanup:
+  // close here so the early `goto cleanup` path doesn't leak the handle
+  if ( txt_u ) u3_book_exit(txt_u);
+  if ( evt_y ) _free(evt_y);
+  _test_rm_rf(dir_c);
+  _free(dir_c);
+
+  fprintf(stderr, " walk_invalidation: %s\r\n", ret_i ? "ok" : "FAILED");
+  return ret_i;
+}
+
+/* _test_walk_range_validation(): invalid ranges should fail.
+**
+** saves events 1-3, then checks that walk_init refuses an inverted
+** range, a range past the last event, and a range starting at 0.
+**
+** returns: 1 if all checks pass, 0 otherwise
+*/
+static c3_i
+_test_walk_range_validation(void)
+{
+  c3_c* dir_c = _test_make_tmpdir();
+  if ( !dir_c ) return 0;
+
+  c3_i ret_i = 1;
+  u3_book* txt_u = u3_book_init(dir_c);
+  c3_y* evt_y[3] = {0};
+  c3_z evt_z;
+
+  if ( !txt_u ) {
+    fprintf(stderr, " walk_range: init failed\r\n");
+    ret_i = 0;
+    goto cleanup;
+  }
+
+  // write 3 events
+  {
+    void* byt_p[3];
+    c3_z siz_i[3];
+
+    for ( int i = 0; i < 3; i++ ) {
+      evt_y[i] = _test_make_event(&evt_z, i + 1);
+      byt_p[i] = evt_y[i];
+      siz_i[i] = evt_z;
+    }
+
+    c3_o sav_o = u3_book_save(txt_u, 1, 3, byt_p, siz_i, 0);
+    if ( c3n == sav_o ) {
+      fprintf(stderr, " walk_range: save failed\r\n");
+      ret_i = 0;
+      goto cleanup;
+    }
+  }
+
+  // try invalid ranges; a walker that was wrongly created is released
+  {
+    u3_book_walk itr_u;
+
+    // nex > las (inverted range)
+    if ( c3y == u3_book_walk_init(txt_u, &itr_u, 3, 1) ) {
+      fprintf(stderr, " walk_range: should reject nex > las\r\n");
+      ret_i = 0;
+      u3_book_walk_done(&itr_u);
+    }
+
+    // las beyond log end
+    if ( c3y == u3_book_walk_init(txt_u, &itr_u, 1, 100) ) {
+      fprintf(stderr, " walk_range: should reject las > log end\r\n");
+      ret_i = 0;
+      u3_book_walk_done(&itr_u);
+    }
+
+    // nex before the first stored event (event 1)
+    if ( c3y == u3_book_walk_init(txt_u, &itr_u, 0, 1) ) {
+      fprintf(stderr, " walk_range: should reject nex < fir_d\r\n");
+      ret_i = 0;
+      u3_book_walk_done(&itr_u);
+    }
+  }
+
+cleanup:
+  // close here so the early `goto cleanup` path doesn't leak the handle
+  if ( txt_u ) u3_book_exit(txt_u);
+  for ( int i = 0; i < 3; i++ ) {
+    if ( evt_y[i] ) _free(evt_y[i]);
+  }
+  _test_rm_rf(dir_c);
+  _free(dir_c);
+
+  fprintf(stderr, " walk_range_validation: %s\r\n", ret_i ? "ok" : "FAILED");
+  return ret_i;
+}
+
+/* _test_invalid_magic(): file with wrong magic number should be rejected.
+**
+** returns: 1 if the damaged log is refused, 0 otherwise
+*/
+static c3_i
+_test_invalid_magic(void)
+{
+  c3_c* tmp_c = _test_make_tmpdir();
+  if ( !tmp_c ) return 0;
+
+  c3_c     pax_c[8192];
+  c3_i     pas_i = 1;
+  u3_book* log_u = u3_book_init(tmp_c);
+
+  if ( !log_u ) {
+    fprintf(stderr, " invalid_magic: init failed\r\n");
+    pas_i = 0;
+    goto done;
+  }
+
+  // close the fresh log so it can be damaged on disk
+  u3_book_exit(log_u);
+  log_u = 0;
+
+  snprintf(pax_c, sizeof(pax_c), "%s/book.log", tmp_c);
+
+  // overwrite the magic number in BOTH header slots (double-buffered)
+  {
+    c3_w bad_w = 0xDEADBEEF;
+
+    if ( c3n == _test_write_raw(pax_c, 0, &bad_w, sizeof(bad_w)) ) {
+      fprintf(stderr, " invalid_magic: write_raw A failed\r\n");
+      pas_i = 0;
+      goto done;
+    }
+
+    if ( c3n == _test_write_raw(pax_c, 4096, &bad_w, sizeof(bad_w)) ) {
+      fprintf(stderr, " invalid_magic: write_raw B failed\r\n");
+      pas_i = 0;
+      goto done;
+    }
+  }
+
+  // a reopen that succeeds is the failure case
+  log_u = u3_book_init(tmp_c);
+  if ( 0 != log_u ) {
+    fprintf(stderr, " invalid_magic: should reject bad magic\r\n");
+    pas_i = 0;
+    u3_book_exit(log_u);
+    log_u = 0;
+  }
+
+done:
+  if ( log_u ) u3_book_exit(log_u);
+  _test_rm_rf(tmp_c);
+  _free(tmp_c);
+
+  fprintf(stderr, " invalid_magic: %s\r\n", pas_i ? "ok" : "FAILED");
+  return pas_i;
+}
+
+/* _test_invalid_version(): file with wrong version should be rejected.
+**
+** returns: 1 if the damaged log is refused, 0 otherwise
+*/
+static c3_i
+_test_invalid_version(void)
+{
+  c3_c* tmp_c = _test_make_tmpdir();
+  if ( !tmp_c ) return 0;
+
+  c3_c     pax_c[8192];
+  c3_i     pas_i = 1;
+  u3_book* log_u = u3_book_init(tmp_c);
+
+  if ( !log_u ) {
+    fprintf(stderr, " invalid_version: init failed\r\n");
+    pas_i = 0;
+    goto done;
+  }
+
+  // close the fresh log so it can be damaged on disk
+  u3_book_exit(log_u);
+  log_u = 0;
+
+  snprintf(pax_c, sizeof(pax_c), "%s/book.log", tmp_c);
+
+  // overwrite the version in BOTH header slots (double-buffered)
+  {
+    c3_w bad_w = 99;
+
+    if ( c3n == _test_write_raw(pax_c, 4, &bad_w, sizeof(bad_w)) ) {
+      fprintf(stderr, " invalid_version: write_raw A failed\r\n");
+      pas_i = 0;
+      goto done;
+    }
+
+    if ( c3n == _test_write_raw(pax_c, 4096 + 4, &bad_w, sizeof(bad_w)) ) {
+      fprintf(stderr, " invalid_version: write_raw B failed\r\n");
+      pas_i = 0;
+      goto done;
+    }
+  }
+
+  // a reopen that succeeds is the failure case
+  log_u = u3_book_init(tmp_c);
+  if ( 0 != log_u ) {
+    fprintf(stderr, " invalid_version: should reject bad version\r\n");
+    pas_i = 0;
+    u3_book_exit(log_u);
+    log_u = 0;
+  }
+
+done:
+  if ( log_u ) u3_book_exit(log_u);
+  _test_rm_rf(tmp_c);
+  _free(tmp_c);
+
+  fprintf(stderr, " invalid_version: %s\r\n", pas_i ? "ok" : "FAILED");
+  return pas_i;
+}
+
+/* _test_undersized_file(): file smaller than header should be rejected.
+**
+** returns: 1 if the truncated log is refused, 0 otherwise
+*/
+static c3_i
+_test_undersized_file(void)
+{
+  c3_c* tmp_c = _test_make_tmpdir();
+  if ( !tmp_c ) return 0;
+
+  c3_c     pax_c[8192];
+  c3_i     pas_i = 1;
+  u3_book* log_u = u3_book_init(tmp_c);
+
+  if ( !log_u ) {
+    fprintf(stderr, " undersized: init failed\r\n");
+    pas_i = 0;
+    goto done;
+  }
+
+  // close the fresh log so it can be cut down on disk
+  u3_book_exit(log_u);
+  log_u = 0;
+
+  // shrink to 8 bytes (less than 8192-byte header area)
+  snprintf(pax_c, sizeof(pax_c), "%s/book.log", tmp_c);
+
+  if ( c3n == _test_truncate_file(pax_c, 8) ) {
+    fprintf(stderr, " undersized: truncate failed\r\n");
+    pas_i = 0;
+    goto done;
+  }
+
+  // a reopen that succeeds is the failure case
+  log_u = u3_book_init(tmp_c);
+  if ( 0 != log_u ) {
+    fprintf(stderr, " undersized: should reject undersized file\r\n");
+    pas_i = 0;
+    u3_book_exit(log_u);
+    log_u = 0;
+  }
+
+done:
+  if ( log_u ) u3_book_exit(log_u);
+  _test_rm_rf(tmp_c);
+  _free(tmp_c);
+
+  fprintf(stderr, " undersized_file: %s\r\n", pas_i ? "ok" : "FAILED");
+  return pas_i;
+}
+
+/* _test_metadata_roundtrip(): save and read all metadata fields.
+**
+** returns: 1 if every field round-trips, 0 otherwise
+*/
+
+/* _test_meta_field(): save one metadata value, read it back, compare.
+**
+** [siz_z] must not exceed the _test_meta_ctx buffer (256 bytes).
+** returns: 1 on an exact round-trip, 0 otherwise (reason on stderr)
+*/
+static c3_i
+_test_meta_field(u3_book* txt_u, const c3_c* key_c, c3_z siz_z, void* val_v)
+{
+  if ( c3n == u3_book_save_meta(txt_u, key_c, siz_z, val_v) ) {
+    fprintf(stderr, " meta_roundtrip: save %s failed\r\n", key_c);
+    return 0;
+  }
+
+  _test_meta_ctx ctx_u = {0};
+  u3_book_read_meta(txt_u, &ctx_u, key_c, _test_meta_cb);
+
+  if ( ctx_u.siz_zs != (c3_zs)siz_z ) {
+    fprintf(stderr, " meta_roundtrip: read %s wrong size\r\n", key_c);
+    return 0;
+  }
+
+  if ( 0 != memcmp(ctx_u.buf_y, val_v, siz_z) ) {
+    fprintf(stderr, " meta_roundtrip: %s mismatch\r\n", key_c);
+    return 0;
+  }
+
+  return 1;
+}
+
+static c3_i
+_test_metadata_roundtrip(void)
+{
+  c3_c* dir_c = _test_make_tmpdir();
+  if ( !dir_c ) return 0;
+
+  c3_i ret_i = 1;
+  u3_book* txt_u = u3_book_init(dir_c);
+
+  if ( !txt_u ) {
+    fprintf(stderr, " meta_roundtrip: init failed\r\n");
+    ret_i = 0;
+    goto cleanup;
+  }
+
+  // every field is exercised even if an earlier one fails
+  {
+    c3_w ver_w    = 42;
+    c3_d who_d[2] = { 0x123456789ABCDEF0, 0xFEDCBA9876543210 };  // 16 bytes
+    c3_o fak_o    = c3y;                                         // 1 byte
+    c3_w lif_w    = 1234;
+
+    if ( !_test_meta_field(txt_u, "version", sizeof(ver_w), &ver_w) ) ret_i = 0;
+    if ( !_test_meta_field(txt_u, "who", sizeof(who_d), who_d) )      ret_i = 0;
+    if ( !_test_meta_field(txt_u, "fake", sizeof(fak_o), &fak_o) )    ret_i = 0;
+    if ( !_test_meta_field(txt_u, "life", sizeof(lif_w), &lif_w) )    ret_i = 0;
+  }
+
+  u3_book_exit(txt_u);
+
+cleanup:
+  _test_rm_rf(dir_c);
+  _free(dir_c);
+
+  fprintf(stderr, " metadata_roundtrip: %s\r\n", ret_i ? "ok" : "FAILED");
+  return ret_i;
+}
+
+/* _test_metadata_invalid_key(): unknown key should return -1.
+**
+** returns: 1 if both the read and the write of the key are refused
+*/
+static c3_i
+_test_metadata_invalid_key(void)
+{
+  c3_c* tmp_c = _test_make_tmpdir();
+  if ( !tmp_c ) return 0;
+
+  c3_i     pas_i = 1;
+  u3_book* log_u = u3_book_init(tmp_c);
+
+  if ( !log_u ) {
+    fprintf(stderr, " meta_invalid: init failed\r\n");
+    pas_i = 0;
+    goto done;
+  }
+
+  // reading a key the log does not know must report size -1
+  {
+    _test_meta_ctx got_u = {0};
+
+    u3_book_read_meta(log_u, &got_u, "nonexistent", _test_meta_cb);
+
+    if ( -1 != got_u.siz_zs ) {
+      fprintf(stderr, " meta_invalid: should return -1 for unknown key\r\n");
+      pas_i = 0;
+    }
+  }
+
+  // writing that key must be refused as well
+  {
+    c3_w val_w = 42;
+
+    if ( c3y == u3_book_save_meta(log_u, "nonexistent", sizeof(val_w), &val_w) ) {
+      fprintf(stderr, " meta_invalid: should reject unknown key\r\n");
+      pas_i = 0;
+    }
+  }
+
+  u3_book_exit(log_u);
+
+done:
+  _test_rm_rf(tmp_c);
+  _free(tmp_c);
+
+  fprintf(stderr, " metadata_invalid_key: %s\r\n", pas_i ? "ok" : "FAILED");
+  return pas_i;
+}
+
+/* _test_metadata_size_validation(): wrong-sized values should be rejected.
+**
+** returns: 1 if every wrong-sized write is refused, 0 otherwise
+*/
+static c3_i
+_test_metadata_size_validation(void)
+{
+  c3_c* tmp_c = _test_make_tmpdir();
+  if ( !tmp_c ) return 0;
+
+  c3_i     pas_i = 1;
+  u3_book* log_u = u3_book_init(tmp_c);
+
+  if ( !log_u ) {
+    fprintf(stderr, " meta_size: init failed\r\n");
+    pas_i = 0;
+    goto done;
+  }
+
+  // "version" takes 4 bytes: offer 2
+  {
+    c3_y two_y[2] = { 0x12, 0x34 };
+
+    if ( c3y == u3_book_save_meta(log_u, "version", 2, two_y) ) {
+      fprintf(stderr, " meta_size: should reject wrong size for version\r\n");
+      pas_i = 0;
+    }
+  }
+
+  // "who" takes 16 bytes: offer 4
+  {
+    c3_w val_w = 42;
+
+    if ( c3y == u3_book_save_meta(log_u, "who", sizeof(val_w), &val_w) ) {
+      fprintf(stderr, " meta_size: should reject wrong size for who\r\n");
+      pas_i = 0;
+    }
+  }
+
+  // "fake" takes 1 byte: offer 4
+  {
+    c3_w val_w = 1;
+
+    if ( c3y == u3_book_save_meta(log_u, "fake", sizeof(val_w), &val_w) ) {
+      fprintf(stderr, " meta_size: should reject wrong size for fake\r\n");
+      pas_i = 0;
+    }
+  }
+
+  u3_book_exit(log_u);
+
+done:
+  _test_rm_rf(tmp_c);
+  _free(tmp_c);
+
+  fprintf(stderr, " metadata_size_validation: %s\r\n", pas_i ? "ok" : "FAILED");
+  return pas_i;
+}
+
+/* _test_partial_batch_recovery(): simulate power failure where header is
+** flushed but deed data is corrupt.
+**
+** writes two batches: event 1 (batch 1), events 2-3 (batch 2).
+** corrupts deed 3's buffer data while keeping its len_d/let_d framing
+** intact. on reopen, the batch 2 checksum should fail and recovery
+** should roll back batch 2, leaving only event 1.
+*/ +static c3_i +_test_partial_batch_recovery(void) +{ + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + c3_y* ev1_y = 0; + c3_y* ev2_y = 0; + c3_y* ev3_y = 0; + c3_z siz_z; + c3_c pax_c[8192]; + + if ( !txt_u ) { + fprintf(stderr, " partial_batch: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // batch 1: write event 1 + ev1_y = _test_make_event(&siz_z, 1); + { + void* byt_p[1] = { ev1_y }; + c3_z siz_i[1] = { siz_z }; + + if ( c3n == u3_book_save(txt_u, 1, 1, byt_p, siz_i, 0) ) { + fprintf(stderr, " partial_batch: save batch 1 failed\r\n"); + ret_i = 0; + goto cleanup; + } + } + + // batch 2: write events 2-3 + ev2_y = _test_make_event(&siz_z, 2); + ev3_y = _test_make_event(&siz_z, 3); + { + void* byt_p[2] = { ev2_y, ev3_y }; + c3_z siz_i[2] = { siz_z, siz_z }; + + if ( c3n == u3_book_save(txt_u, 2, 2, byt_p, siz_i, 0) ) { + fprintf(stderr, " partial_batch: save batch 2 failed\r\n"); + ret_i = 0; + goto cleanup; + } + } + + u3_book_exit(txt_u); + txt_u = 0; + + // corrupt deed 3's buffer data while keeping framing intact + // + // each deed: len_d (8) + buffer (siz_z) + let_d (8) + // + snprintf(pax_c, sizeof(pax_c), "%s/book.log", dir_c); + { + c3_d ded_d = 8 + siz_z + 8; + c3_d dee_d = BOOK_DEED_BASE + (ded_d * 2); + c3_d buf_d = dee_d + 8; + + c3_y jnk_y[64]; + memset(jnk_y, 0xFF, sizeof(jnk_y)); + if ( c3n == _test_write_raw(pax_c, buf_d, jnk_y, siz_z) ) { + fprintf(stderr, " partial_batch: corrupt failed\r\n"); + ret_i = 0; + goto cleanup; + } + } + + // reopen — batch 2 checksum should fail, recovery rolls back batch 2 + txt_u = u3_book_init(dir_c); + if ( !txt_u ) { + fprintf(stderr, " partial_batch: reopen failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // verify only event 1 remains (batch 2 rolled back) + { + c3_d low_d, hig_d; + u3_book_gulf(txt_u, &low_d, &hig_d); + + if ( hig_d != 1 ) { + fprintf(stderr, " partial_batch: expected hig=1, got %" PRIu64 "\r\n", 
hig_d); + ret_i = 0; + } + } + + // verify event 1 is still readable and correct + { + _test_read_ctx ctx_u = {0}; + + if ( c3n == u3_book_read(txt_u, &ctx_u, 1, 1, _test_read_cb) ) { + fprintf(stderr, " partial_batch: read event 1 failed\r\n"); + ret_i = 0; + } + else { + if ( ctx_u.len_z != siz_z || + 0 != memcmp(ctx_u.buf_y, ev1_y, siz_z) ) + { + fprintf(stderr, " partial_batch: event 1 data mismatch\r\n"); + ret_i = 0; + } + _free(ctx_u.buf_y); + } + } + + u3_book_exit(txt_u); + txt_u = 0; + +cleanup: + if ( txt_u ) u3_book_exit(txt_u); + if ( ev1_y ) _free(ev1_y); + if ( ev2_y ) _free(ev2_y); + if ( ev3_y ) _free(ev3_y); + _test_rm_rf(dir_c); + _free(dir_c); + + fprintf(stderr, " partial_batch_recovery: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; +} + +//============================================================================== +// Benchmarks +//============================================================================== + +/* _bench_make_event(): create a dummy event of specified size. +** +** creates a buffer with 4-byte mug followed by dummy data. +** the data is filled with a pattern based on the event number. +** +** returns: heap-allocated buffer (caller must free) +*/ +static c3_y* +_bench_make_event(c3_z siz_z, c3_d eve_d) +{ + c3_y* buf_y = _alloc(siz_z); + + // mug: simple hash from event number + c3_w mug_w = (c3_w)(eve_d * 0x12345678); + memcpy(buf_y, &mug_w, 4); + + // fill remaining bytes with pattern + for ( c3_z i = 4; i < siz_z; i++ ) { + buf_y[i] = (c3_y)((eve_d + i) & 0xFF); + } + + return buf_y; +} + +/* _bench_get_time_ns(): get current time in nanoseconds. +*/ +static c3_d +_bench_get_time_ns(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (c3_d)ts.tv_sec * 1000000000ULL + (c3_d)ts.tv_nsec; +} + +/* _bench_write_speed(): benchmark write performance. +** +** writes [num_d] events of [siz_z] bytes each, one at a time. +** reports total time, events/sec, MB/s, and per-event latency. 
+*/ +static c3_i +_bench_write_speed(c3_d num_d, c3_z siz_z) +{ + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + + if ( !txt_u ) { + fprintf(stderr, " write_speed: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // pre-allocate event buffer (reuse for all writes) + c3_y* evt_y = _bench_make_event(siz_z, 1); + + // start timing + c3_d beg_d = _bench_get_time_ns(); + + // write events one at a time + for ( c3_d i = 0; i < num_d; i++ ) { + // update event data pattern for variety + c3_w mug_w = (c3_w)((i + 1) * 0x12345678); + memcpy(evt_y, &mug_w, 4); + + void* byt_p[1] = { evt_y }; + c3_z siz_i[1] = { siz_z }; + + c3_o sav_o = u3_book_save(txt_u, i + 1, 1, byt_p, siz_i, 0); + if ( c3n == sav_o ) { + fprintf(stderr, " write_speed: save failed at event %" PRIu64 "\r\n", i + 1); + ret_i = 0; + _free(evt_y); + goto cleanup; + } + } + + // end timing + c3_d end_d = _bench_get_time_ns(); + c3_d lap_d = end_d - beg_d; // elapsed nanoseconds + + // calculate metrics + double elapsed_sec = (double)lap_d / 1e9; + double events_per_sec = (double)num_d / elapsed_sec; + double total_bytes = (double)num_d * (double)siz_z; + double mb_per_sec = (total_bytes / (1024.0 * 1024.0)) / elapsed_sec; + double us_per_event = ((double)lap_d / 1000.0) / (double)num_d; + + // report results + fprintf(stderr, "\r\n"); + fprintf(stderr, " write_speed benchmark (single-event writes):\r\n"); + fprintf(stderr, " events written: %" PRIu64 "\r\n", num_d); + fprintf(stderr, " event size: %" PRIu64 " bytes\r\n", (c3_d)siz_z); + fprintf(stderr, " total data: %.2f MB\r\n", total_bytes / (1024.0 * 1024.0)); + fprintf(stderr, " total time: %.3f seconds\r\n", elapsed_sec); + fprintf(stderr, " write speed: %.0f events/sec\r\n", events_per_sec); + fprintf(stderr, " throughput: %.2f MB/sec\r\n", mb_per_sec); + fprintf(stderr, " latency: %.1f us/event\r\n", us_per_event); + fprintf(stderr, "\r\n"); + + _free(evt_y); + 
u3_book_exit(txt_u); + +cleanup: + _test_rm_rf(dir_c); + _free(dir_c); + + fprintf(stderr, " write_speed_benchmark: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; +} + +/* _bench_write_speed_batched(): benchmark batched write performance. +** +** writes [num_d] events of [siz_z] bytes in batches of [bat_d]. +** reports total time, events/sec, MB/s, and per-event latency. +*/ +static c3_i +_bench_write_speed_batched(c3_d num_d, c3_z siz_z, c3_d bat_d) +{ + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + + if ( !txt_u ) { + fprintf(stderr, " write_speed_batched: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // allocate batch arrays + c3_y** evt_y = _alloc(bat_d * sizeof(c3_y*)); + void** byt_p = _alloc(bat_d * sizeof(void*)); + c3_z* siz_i = _alloc(bat_d * sizeof(c3_z)); + + // pre-allocate event buffers for batch + for ( c3_d i = 0; i < bat_d; i++ ) { + evt_y[i] = _bench_make_event(siz_z, i + 1); + byt_p[i] = evt_y[i]; + siz_i[i] = siz_z; + } + + // start timing + c3_d start_d = _bench_get_time_ns(); + + // write events in batches + c3_d wit_d = 0; // counter + while ( wit_d < num_d ) { + c3_d remaining = num_d - wit_d; + c3_d batch_size = (remaining < bat_d) ? 
remaining : bat_d; + + // update event data patterns + for ( c3_d i = 0; i < batch_size; i++ ) { + c3_w mug_w = (c3_w)((wit_d + i + 1) * 0x12345678); + memcpy(evt_y[i], &mug_w, 4); + } + + c3_o sav_o = u3_book_save(txt_u, wit_d + 1, batch_size, byt_p, siz_i, 0); + if ( c3n == sav_o ) { + fprintf(stderr, " write_speed_batched: save failed at event %" PRIu64 "\r\n", + wit_d + 1); + ret_i = 0; + goto cleanup_buffers; + } + + wit_d += batch_size; + } + + // end timing + c3_d end_d = _bench_get_time_ns(); + c3_d lap_d = end_d - start_d; // nanoseconds + + // calculate metrics + double elapsed_sec = (double)lap_d / 1e9; + double events_per_sec = (double)num_d / elapsed_sec; + double total_bytes = (double)num_d * (double)siz_z; + double mb_per_sec = (total_bytes / (1024.0 * 1024.0)) / elapsed_sec; + double us_per_event = ((double)lap_d / 1000.0) / (double)num_d; + + // report results + fprintf(stderr, "\r\n"); + fprintf(stderr, " write_speed benchmark (batched writes, batch=%" PRIu64 "):\r\n", bat_d); + fprintf(stderr, " events written: %" PRIu64 "\r\n", num_d); + fprintf(stderr, " event size: %" PRIu64 " bytes\r\n", (c3_d)siz_z); + fprintf(stderr, " total data: %.2f MB\r\n", total_bytes / (1024.0 * 1024.0)); + fprintf(stderr, " total time: %.3f seconds\r\n", elapsed_sec); + fprintf(stderr, " write speed: %.0f events/sec\r\n", events_per_sec); + fprintf(stderr, " throughput: %.2f MB/sec\r\n", mb_per_sec); + fprintf(stderr, " latency: %.1f us/event\r\n", us_per_event); + fprintf(stderr, "\r\n"); + +cleanup_buffers: + for ( c3_d i = 0; i < bat_d; i++ ) { + _free(evt_y[i]); + } + _free(evt_y); + _free(byt_p); + _free(siz_i); + + u3_book_exit(txt_u); + +cleanup: + _test_rm_rf(dir_c); + _free(dir_c); + + fprintf(stderr, " write_speed_batched_benchmark: %s\r\n", ret_i ? "ok" : "FAILED"); + return ret_i; +} + +/* _bench_write_speed_mixed(): benchmark mixed batch-size write performance. 
+** +** writes [num_d] events of [siz_z] bytes using a realistic distribution +** of batch sizes (1-9), interleaved via deterministic PRNG. +** reports total time, events/sec, MB/s, per-event latency, and save calls. +*/ +static c3_i +_bench_write_speed_mixed(c3_d num_d, c3_z siz_z) +{ + // batch size distribution from production telemetry + // + static const c3_d bat_d[9] = { 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + static const c3_d cnt_d[9] = { + 2128433, 407761, 234541, 89359, 41390, 21376, 10945, 5399, 5466 + }; + + // compute original total events for scaling + // + c3_d ori_d = 0; + for ( c3_d i = 0; i < 9; i++ ) { + ori_d += bat_d[i] * cnt_d[i]; + } + + // scale counts proportionally to num_d + // + c3_d rem_d[9]; + c3_d tot_d = 0; + for ( c3_d i = 0; i < 9; i++ ) { + rem_d[i] = (cnt_d[i] * num_d) / ori_d; + if ( (0 == rem_d[i]) && (cnt_d[i] > 0) ) { + rem_d[i] = 1; + } + tot_d += rem_d[i]; + } + + c3_c* dir_c = _test_make_tmpdir(); + if ( !dir_c ) return 0; + + c3_i ret_i = 1; + u3_book* txt_u = u3_book_init(dir_c); + + if ( !txt_u ) { + fprintf(stderr, " write_speed_mixed: init failed\r\n"); + ret_i = 0; + goto cleanup; + } + + // pre-allocate event buffers for max batch size (9) + // + c3_y* evt_y[9]; + void* byt_p[9]; + c3_z siz_i[9]; + + for ( c3_d i = 0; i < 9; i++ ) { + evt_y[i] = _bench_make_event(siz_z, i + 1); + byt_p[i] = evt_y[i]; + siz_i[i] = siz_z; + } + + // deterministic xorshift32 PRNG + // + c3_w rng_w = 12345; + + c3_d wit_d = 0; // events written + c3_d cal_d = 0; // save calls made + + // start timing + // + c3_d beg_d = _bench_get_time_ns(); + + while ( tot_d > 0 ) { + // xorshift32 step + // + rng_w ^= rng_w << 13; + rng_w ^= rng_w >> 17; + rng_w ^= rng_w << 5; + + // weighted selection from remaining counts + // + c3_d pick = (c3_d)rng_w % tot_d; + c3_d acc = 0; + c3_d idx = 0; + + for ( idx = 0; idx < 9; idx++ ) { + acc += rem_d[idx]; + if ( pick < acc ) break; + } + + c3_d bsz = bat_d[idx]; + rem_d[idx]--; + tot_d--; + + // update mug 
patterns in event buffers + // + for ( c3_d j = 0; j < bsz; j++ ) { + c3_w mug_w = (c3_w)((wit_d + j + 1) * 0x12345678); + memcpy(evt_y[j], &mug_w, 4); + } + + c3_o sav_o = u3_book_save(txt_u, wit_d + 1, bsz, byt_p, siz_i, 0); + if ( c3n == sav_o ) { + fprintf(stderr, " write_speed_mixed: save failed at event %" PRIu64 "\r\n", + wit_d + 1); + ret_i = 0; + goto cleanup_buffers; + } + + wit_d += bsz; + cal_d++; + } + + // end timing + // + c3_d end_d = _bench_get_time_ns(); + c3_d lap_d = end_d - beg_d; + + // calculate metrics + // + double elapsed_sec = (double)lap_d / 1e9; + double events_per_sec = (double)wit_d / elapsed_sec; + double total_bytes = (double)wit_d * (double)siz_z; + double mb_per_sec = (total_bytes / (1024.0 * 1024.0)) / elapsed_sec; + double us_per_event = ((double)lap_d / 1000.0) / (double)wit_d; + + // report results + // + fprintf(stderr, "\r\n"); + fprintf(stderr, " write_speed benchmark (mixed batch sizes 1-9):\r\n"); + fprintf(stderr, " events written: %" PRIu64 "\r\n", wit_d); + fprintf(stderr, " save calls: %" PRIu64 "\r\n", cal_d); + fprintf(stderr, " event size: %" PRIu64 " bytes\r\n", (c3_d)siz_z); + fprintf(stderr, " total data: %.2f MB\r\n", total_bytes / (1024.0 * 1024.0)); + fprintf(stderr, " total time: %.3f seconds\r\n", elapsed_sec); + fprintf(stderr, " write speed: %.0f events/sec\r\n", events_per_sec); + fprintf(stderr, " throughput: %.2f MB/sec\r\n", mb_per_sec); + fprintf(stderr, " latency: %.1f us/event\r\n", us_per_event); + fprintf(stderr, "\r\n"); + +cleanup_buffers: + for ( c3_d i = 0; i < 9; i++ ) { + _free(evt_y[i]); + } + + u3_book_exit(txt_u); + +cleanup: + _test_rm_rf(dir_c); + _free(dir_c); + + fprintf(stderr, " write_speed_mixed_benchmark: %s\r\n", ret_i ? 
"ok" : "FAILED"); + return ret_i; +} + +//============================================================================== +// Main +//============================================================================== + +int +main(int argc, char* argv[]) +{ + c3_i ret_i = 1; + + // boundary tests + ret_i &= _test_empty_log_operations(); + ret_i &= _test_single_event_lifecycle(); + ret_i &= _test_epoch_boundary_validation(); + ret_i &= _test_contiguity_gap_rejection(); + ret_i &= _test_minimum_event_size(); + + // crash recovery tests + ret_i &= _test_truncated_file_recovery(); + ret_i &= _test_partial_batch_recovery(); + + // iterator tests + ret_i &= _test_walk_single_event(); + ret_i &= _test_walk_invalidation(); + ret_i &= _test_walk_range_validation(); + + // header & format tests + ret_i &= _test_invalid_magic(); + ret_i &= _test_invalid_version(); + ret_i &= _test_undersized_file(); + + // metadata tests + ret_i &= _test_metadata_roundtrip(); + ret_i &= _test_metadata_invalid_key(); + ret_i &= _test_metadata_size_validation(); + + // benchmarks + // ret_i &= _bench_write_speed(1000, 128); + // ret_i &= _bench_write_speed_batched(100000, 1280, 1000); + // ret_i &= _bench_write_speed_mixed(10000, 128); + + fprintf(stderr, "\r\n"); + if ( ret_i ) { + fprintf(stderr, "book_tests: ok\n"); + return 0; + } + else { + fprintf(stderr, "book_tests: failed\n"); + return 1; + } +} diff --git a/pkg/vere/build.zig b/pkg/vere/build.zig index 5f4429fd0c..acd18963ea 100644 --- a/pkg/vere/build.zig +++ b/pkg/vere/build.zig @@ -216,6 +216,7 @@ const c_source_files = [_][]const u8{ "ca_bundle/ca_bundle.c", "dawn.c", "db/lmdb.c", + "db/book.c", "disk.c", "foil.c", "io/ames.c", @@ -246,6 +247,7 @@ const c_source_files = [_][]const u8{ const install_headers = [_][]const u8{ "db/lmdb.h", + "db/book.h", "dns_sd.h", "io/ames/stun.h", "io/lss.h", diff --git a/pkg/vere/db/book.c b/pkg/vere/db/book.c new file mode 100644 index 0000000000..c1055ec61f --- /dev/null +++ b/pkg/vere/db/book.c @@ 
-0,0 +1,1356 @@ +/// @file + +#include "db/book.h" + +#include +#ifndef U3_OS_windows +# include +#endif +#include +#include +#include +#include +#include +#include + +#include "c3/c3.h" +#include "noun.h" +#include "ship.h" + +// book: append-only event log +// +// simple file-based persistence layer for urbit's event log. +// optimized for sequential writes and reads; no random access. +// +// file format: +// [24-byte header] +// [events: len_d | buffer_data | let_d] +// +// metadata stored in separate meta.bin file +// + + #define BOOK_MAGIC 0x424f4f4b // "BOOK" + #define BOOK_VERSION 1 // format version + + // header slot offsets (page-aligned for atomic writes) + #define BOOK_HEAD_A 0 // first header slot + #define BOOK_HEAD_B 4096 // second header slot + #define BOOK_DEED_BASE 8192 // deeds start here + +/* _book_head_crc(): compute CRC32 of header fields. +*/ +static c3_l +_book_head_crc(const u3_book_head* hed_u) +{ + c3_z len_z = offsetof(u3_book_head, crc_w); + return (c3_l)crc32(0, (const c3_y*)hed_u, len_z); +} + +/* _book_head_okay(): validate header magic, version, and checksum. +*/ +static c3_o +_book_head_okay(const u3_book_head* hed_u) +{ + if ( BOOK_MAGIC != hed_u->mag_w ) { + return c3n; + } + + if ( BOOK_VERSION != hed_u->ver_w ) { + return c3n; + } + + c3_w crc_w = _book_head_crc(hed_u); + if ( crc_w != hed_u->crc_w ) { + return c3n; + } + + return c3y; +} + +/* _book_save_head(): write header to inactive slot, sync, swap active. +** +** caller must set hed_u fields (e.g. las_d) before calling. +** increments seq_d, recomputes crc_w, and swaps active slot. +*/ +static c3_o +_book_save_head(u3_book* txt_u) +{ + txt_u->hed_u.seq_d++; + txt_u->hed_u.crc_w = _book_head_crc(&txt_u->hed_u); + + c3_d slot_d = (txt_u->act_w == 0) ? 
BOOK_HEAD_B : BOOK_HEAD_A; + if ( sizeof(u3_book_head) != pwrite(txt_u->fid_i, &txt_u->hed_u, + sizeof(u3_book_head), slot_d) ) + { + fprintf(stderr, "book: failed to write header: %s\r\n", strerror(errno)); + return c3n; + } + + if ( -1 == c3_sync(txt_u->fid_i) ) { + fprintf(stderr, "book: failed to sync: %s\r\n", strerror(errno)); + return c3n; + } + + txt_u->act_w = (txt_u->act_w == 0) ? 1 : 0; + return c3y; +} + +/* _book_meta_path(): construct path to metadata file. +** +** NB: caller must free the result. +*/ +static c3_c* +_book_meta_path(const c3_c* pax_c) +{ + c3_c* met_c = c3_malloc(strlen(pax_c) + 16); + + if ( !met_c ) { + return 0; + } + + snprintf(met_c, strlen(pax_c) + 16, "%s/meta.bin", pax_c); + return met_c; +} + +/* _book_init_meta_file(): open or create metadata file. +*/ +static c3_i +_book_init_meta_file(const c3_c* pax_c) +{ + c3_c* met_c = _book_meta_path(pax_c); + c3_i met_i = c3_open(met_c, O_RDWR | O_CREAT, 0644); + + if ( 0 > met_i ) { + c3_free(met_c); + return -1; + } + + struct stat buf_u; + if ( 0 > fstat(met_i, &buf_u) ) { + goto fail; + } + + if ( 0 == buf_u.st_size ) { + u3_book_meta met_u; + memset(&met_u, 0, sizeof(u3_book_meta)); + + if ( sizeof(u3_book_meta) != pwrite(met_i, &met_u, sizeof(u3_book_meta), 0) ) { + goto fail; + } + + if ( -1 == c3_sync(met_i) ) { + goto fail; + } + } + + c3_free(met_c); + return met_i; + +fail: + close(met_i); + c3_free(met_c); + return -1; +} + +/* _book_read_meta_file(): read metadata from disk. +*/ +static c3_o +_book_read_meta_file(c3_i met_i, u3_book_meta* met_u) +{ + if ( 0 > met_i ) { + return c3n; + } + + c3_zs ret_zs = pread(met_i, met_u, sizeof(u3_book_meta), 0); + if ( ret_zs != sizeof(u3_book_meta) ) { + return c3n; + } + + return c3y; +} + +/* _book_save_meta_file(): write metadata to disk. 
+*/ +static c3_o +_book_save_meta_file(c3_i met_i, const u3_book_meta* met_u) +{ + if ( 0 > met_i ) { + return c3n; + } + + c3_zs ret_zs = pwrite(met_i, met_u, sizeof(u3_book_meta), 0); + if ( ret_zs != sizeof(u3_book_meta) ) { + return c3n; + } + + if ( -1 == c3_sync(met_i) ) { + return c3n; + } + + return c3y; +} + +/* _book_make_head(): initialize and write both header slots for new file. +** +** caller should set fir_d and las_d on txt_u->hed_u before calling +** (e.g. to epoch base for non-zero epochs, or 0 for fresh logs). +** both header slots are initialized identically with seq_d = 0. +*/ +static c3_o +_book_make_head(u3_book* txt_u) +{ + c3_zs ret_zs; + c3_d fir_d = txt_u->hed_u.fir_d; + c3_d las_d = txt_u->hed_u.las_d; + + memset(&txt_u->hed_u, 0, sizeof(u3_book_head)); + txt_u->hed_u.mag_w = BOOK_MAGIC; + txt_u->hed_u.ver_w = BOOK_VERSION; + txt_u->hed_u.fir_d = fir_d; + txt_u->hed_u.las_d = las_d; + txt_u->hed_u.seq_d = 0; + txt_u->hed_u.crc_w = _book_head_crc(&txt_u->hed_u); + + ret_zs = pwrite(txt_u->fid_i, &txt_u->hed_u, + sizeof(u3_book_head), BOOK_HEAD_A); + + if ( ret_zs != sizeof(u3_book_head) ) { + u3l_log("book: failed to write header A: %s\r\n", + strerror(errno)); + return c3n; + } + + ret_zs = pwrite(txt_u->fid_i, &txt_u->hed_u, + sizeof(u3_book_head), BOOK_HEAD_B); + + if ( ret_zs != sizeof(u3_book_head) ) { + u3l_log("book: failed to write header B: %s\r\n", + strerror(errno)); + return c3n; + } + + // extend file so it passes minimum size check on reopen + if ( -1 == ftruncate(txt_u->fid_i, BOOK_DEED_BASE) ) { + u3l_log("book: failed to extend file: %s\r\n", + strerror(errno)); + return c3n; + } + + if ( -1 == c3_sync(txt_u->fid_i) ) { + u3l_log("book: failed to sync headers: %s\r\n", + strerror(errno)); + return c3n; + } + + txt_u->act_w = 0; // start with slot A as active + + return c3y; +} + +/* _book_take_head(): select valid header from two candidates. 
+*/ +static c3_o +_book_take_head(const u3_book_head* hed_u, c3_o val_o, + const u3_book_head* deh_u, c3_o lav_o, + u3_book_head* out_u, c3_w* act_w) +{ + if ( c3y == val_o && c3y == lav_o ) { + if ( hed_u->seq_d >= deh_u->seq_d ) { + *out_u = *hed_u; + if ( act_w ) *act_w = 0; // A + } else { + *out_u = *deh_u; + if ( act_w ) *act_w = 1; // B + } + return c3y; + } + if ( c3y == val_o ) { + *out_u = *hed_u; + if ( act_w ) *act_w = 0; // A + return c3y; + } + if ( c3y == lav_o ) { + *out_u = *deh_u; + if ( act_w ) *act_w = 1; // B + return c3y; + } + return c3n; +} + +/* _book_read_head(): read both header slots and select valid one. +** +** reads both header slots, validates checksums, and selects the one +** with the higher sequence number. this implements the LMDB-style +** double-buffered commit protocol. +** +** on success, txt_u->hed_u contains the valid header and txt_u->act_w +** is set to the active slot index (0 or 1). +*/ +static c3_o +_book_read_head(u3_book* txt_u) +{ + u3_book_head hed_a, hed_b; + c3_o val_a, val_b; + c3_zs ret_zs; + + ret_zs = pread(txt_u->fid_i, &hed_a, sizeof(u3_book_head), BOOK_HEAD_A); + if ( ret_zs != sizeof(u3_book_head) ) { + fprintf(stderr, "book: failed to read header A\r\n"); + val_a = c3n; + } + else { + val_a = _book_head_okay(&hed_a); + } + + ret_zs = pread(txt_u->fid_i, &hed_b, sizeof(u3_book_head), BOOK_HEAD_B); + if ( ret_zs != sizeof(u3_book_head) ) { + fprintf(stderr, "book: failed to read header B\r\n"); + val_b = c3n; + } + else { + val_b = _book_head_okay(&hed_b); + } + + if ( c3n == _book_take_head(&hed_a, val_a, &hed_b, val_b, + &txt_u->hed_u, &txt_u->act_w) ) { + fprintf(stderr, "book: no valid header found\r\n"); + return c3n; + } + + return c3y; +} + +/* _book_deed_size(): calculate total on-disk size of deed. +*/ +static inline c3_d +_book_deed_size(c3_d len_d) +{ + return sizeof(c3_d) + len_d + sizeof(c3_d); +} + + +/* _book_read_deed(): read deed from file into [red_u]. 
+** +** returns: +** c3y: success, buf_y allocated with complete buffer +** c3n: failure (EOF or corruption) +** +** on success, caller must free red_u->buf_y +*/ +static c3_o +_book_read_deed(c3_i fid_i, c3_d* off_d, u3_book_reed* red_u) +{ + c3_zs ret_zs; + c3_d now_d = *off_d; + + c3_d len_d; + ret_zs = pread(fid_i, &len_d, sizeof(c3_d), now_d); + if ( ret_zs != sizeof(c3_d) ) { + return c3n; + } + now_d += sizeof(c3_d); + + red_u->buf_y = c3_malloc(len_d); + if ( !red_u->buf_y ) { + return c3n; + } + ret_zs = pread(fid_i, red_u->buf_y, len_d, now_d); + if ( ret_zs != (c3_zs)len_d ) { + c3_free(red_u->buf_y); + return c3n; + } + now_d += len_d; + + c3_d let_d; + ret_zs = pread(fid_i, &let_d, sizeof(c3_d), now_d); + if ( ret_zs != sizeof(c3_d) ) { + c3_free(red_u->buf_y); + return c3n; + } + now_d += sizeof(c3_d); + + if ( len_d != let_d ) { + c3_free(red_u->buf_y); + return c3n; + } + + red_u->len_d = len_d; + *off_d = now_d; + + return c3y; +} + +/* _book_skip_deed(): advance file offset past next deed without reading it. +*/ +static c3_o +_book_skip_deed(c3_i fid_i, c3_d* off_d) +{ + c3_zs ret_zs; + c3_d len_d; + + ret_zs = pread(fid_i, &len_d, sizeof(c3_d), *off_d); + if ( ret_zs != sizeof(c3_d) ) { + return c3n; + } + + *off_d += _book_deed_size(len_d); + + return c3y; +} + +/* _book_scan_back(): fast reverse scan to validate last deed. +** +** this is the fast path for normal startup. uses header's las_d +** as the authoritative last event number, and validates backward +** from file end using the trailing let_d field. +** +** on success: +** - sets *off_d to append offset (byte after last valid deed) +** - sets txt_u->las_d from header's las_d +** +** returns: +** c3y: last deed valid OR file is empty (no deeds) +** c3n: corruption detected (caller should fall back to _book_scan_fore) +** +** NB: does NOT truncate file or perform recovery; just reports state. 
+*/ +static c3_o +_book_scan_back(u3_book* txt_u, c3_d* off_d) +{ + struct stat buf_u; + c3_d end_d; + c3_d pos_d; + + if ( -1 == fstat(txt_u->fid_i, &buf_u) ) { + *off_d = BOOK_DEED_BASE; + return c3n; + } + + end_d = (c3_d)buf_u.st_size; + + // empty or header-only file is valid (no deeds yet) + if ( end_d <= BOOK_DEED_BASE ) { + *off_d = BOOK_DEED_BASE; + txt_u->las_d = txt_u->hed_u.las_d; + return c3y; + } + + // if header says no events, but file has data beyond header, + // that's uncommitted data - fall back to forward scan + if ( 0 == txt_u->hed_u.las_d ) { + *off_d = BOOK_DEED_BASE; + return c3n; + } + + pos_d = end_d; + c3_d min_size = sizeof(u3_book_deed) + sizeof(c3_d); + + // validate last deed by reading its trailing length field + if ( pos_d < BOOK_DEED_BASE + min_size ) { + *off_d = BOOK_DEED_BASE; + return c3n; + } + + c3_zs ret_zs; + c3_d let_d; + + ret_zs = pread(txt_u->fid_i, &let_d, sizeof(c3_d), + pos_d - sizeof(c3_d)); + if ( ret_zs != sizeof(c3_d) ) { + *off_d = BOOK_DEED_BASE; + return c3n; + } + + // calculate deed size and start position + c3_d siz_d = _book_deed_size(let_d); + if ( siz_d > pos_d - BOOK_DEED_BASE ) { + // deed would extend before header + *off_d = BOOK_DEED_BASE; + return c3n; + } + + c3_d ded_d = pos_d - siz_d; + + { + u3_book_reed red_u; + c3_d tmp_d = ded_d; + + if ( c3n == _book_read_deed(txt_u->fid_i, &tmp_d, &red_u) ) { + *off_d = BOOK_DEED_BASE; + return c3n; + } + + if ( 0 == red_u.len_d ) { + c3_free(red_u.buf_y); + *off_d = BOOK_DEED_BASE; + return c3n; + } + + // deed is valid — verify batch checksum before accepting + c3_free(red_u.buf_y); + + if ( txt_u->hed_u.bat_w > 0 ) { + // walk backward through bat_w deeds to find batch start + c3_d cur_d = pos_d; + + for ( c3_d i_d = 0; i_d < txt_u->hed_u.bat_w; i_d++ ) { + if ( cur_d < BOOK_DEED_BASE + sizeof(c3_d) ) { + *off_d = BOOK_DEED_BASE; + return c3n; + } + + c3_d tet_d; + ret_zs = pread(txt_u->fid_i, &tet_d, sizeof(c3_d), + cur_d - sizeof(c3_d)); + if ( 
ret_zs != sizeof(c3_d) ) { + *off_d = BOOK_DEED_BASE; + return c3n; + } + + c3_d ded_d = _book_deed_size(tet_d); + if ( ded_d > cur_d - BOOK_DEED_BASE ) { + *off_d = BOOK_DEED_BASE; + return c3n; + } + + cur_d -= ded_d; + } + + // read the batch region and verify checksum + c3_d byt_d = pos_d - cur_d; + c3_y* bat_y = c3_malloc(byt_d); + + if ( !bat_y ) { + *off_d = BOOK_DEED_BASE; + return c3n; + } + + ret_zs = pread(txt_u->fid_i, bat_y, byt_d, cur_d); + if ( ret_zs != (c3_zs)byt_d ) { + c3_free(bat_y); + *off_d = BOOK_DEED_BASE; + return c3n; + } + + c3_w sum_w = (c3_w)crc32(0, bat_y, byt_d); + c3_free(bat_y); + + if ( sum_w != txt_u->hed_u.sum_w ) { + fprintf(stderr, "book: batch checksum mismatch\r\n"); + *off_d = BOOK_DEED_BASE; + return c3n; + } + } + + *off_d = pos_d; + txt_u->las_d = txt_u->hed_u.las_d; + return c3y; + } +} + +/* _book_scan_fore(): recovery forward scan to find last valid deed. +** +** used as fallback when _book_scan_back fails (corruption recovery). +** validates each record's CRC and len_d == let_d sequentially. +** if corruption is found, truncates file and updates header. +** +** on completion: +** - sets *off_d to append offset +** - sets txt_u->las_d to last valid event number +** - truncates file if corrupted trailing data was found +** - updates header if recovery changed the count +** +** returns: +** c3y: always (recovery is best-effort) +*/ +static c3_o +_book_scan_fore(u3_book* txt_u, c3_d* off_d) +{ + c3_d cur_d = BOOK_DEED_BASE; // start of events + c3_d cot_d = 0; // count of valid deeds found + c3_d las_d = 0; // last valid event number found + c3_d exp_d; // expected event count from header + + if ( 0 == txt_u->hed_u.fir_d && 0 == txt_u->hed_u.las_d ) { + // empty log is valid (no deeds yet) + txt_u->las_d = 0; + *off_d = cur_d; + return c3y; + } + + // expected count based on header's las_d + // NB: fir_d is the epoch base; events are fir_d+1 through las_d + exp_d = ( txt_u->hed_u.las_d > txt_u->hed_u.fir_d ) + ? 
txt_u->hed_u.las_d - txt_u->hed_u.fir_d + : 0; + + while ( 1 ) { + u3_book_reed red_u; + c3_d beg_d = cur_d; + + if ( c3n == _book_read_deed(txt_u->fid_i, &cur_d, &red_u) ) { + break; + } + + if ( 0 == red_u.len_d ) { + u3l_log("book: validation failed at offset %" PRIu64 "\r\n", beg_d); + c3_free(red_u.buf_y); + break; + } + + // deed is valid - calculate its event number + // NB: first deed is event fir_d + 1 + las_d = txt_u->hed_u.fir_d + 1 + cot_d; + c3_free(red_u.buf_y); + cot_d++; + } + + // check if we found fewer events than header claims + if ( cot_d != exp_d ) { + u3l_log("book: recovery: found %" PRIu64 " events, expected %" PRIu64 "\r\n", + cot_d, exp_d); + + // update las_d based on what we found + if ( 0 == cot_d ) { + txt_u->las_d = 0; + las_d = 0; + cur_d = BOOK_DEED_BASE; + } else { + txt_u->las_d = las_d; + } + + // truncate file to remove invalid data + if ( -1 == ftruncate(txt_u->fid_i, cur_d) ) { + u3l_log("book: failed to truncate: %s\r\n", + strerror(errno)); + } else { + if ( -1 == c3_sync(txt_u->fid_i) ) { + u3l_log("book: failed to sync after truncate: %s\r\n", + strerror(errno)); + } + } + + // update header to match recovered state + txt_u->hed_u.las_d = las_d; + txt_u->hed_u.sum_w = 0; + txt_u->hed_u.bat_w = 0; + _book_save_head(txt_u); + } else { + txt_u->las_d = las_d; + } + + *off_d = cur_d; + return c3y; +} + +/* _book_check_batch(): verify batch integrity and roll back if corrupt. +** +** verifies that the latest batch of deeds matches the checksum +** stored in the header. if the checksum fails, truncates the +** file to remove the corrupt batch and updates the header. +** +** this protects against power failure where the header is flushed +** to disk but deed data is only partially written. 
+*/ +static void +_book_check_batch(u3_book* txt_u) +{ + if ( 0 == txt_u->hed_u.bat_w ) { + return; + } + + // walk backward through bat_w deeds to find batch start + c3_d cur_d = txt_u->off_d; + + for ( c3_d i_d = 0; i_d < txt_u->hed_u.bat_w; i_d++ ) { + if ( cur_d < BOOK_DEED_BASE + sizeof(c3_d) ) { + return; + } + + c3_d tet_d; + c3_zs ret_zs = pread(txt_u->fid_i, &tet_d, sizeof(c3_d), + cur_d - sizeof(c3_d)); + if ( ret_zs != sizeof(c3_d) ) { + return; + } + + c3_d ded_d = _book_deed_size(tet_d); + if ( ded_d > cur_d - BOOK_DEED_BASE ) { + return; + } + + cur_d -= ded_d; + } + + // read the batch region and verify checksum + c3_d byt_d = txt_u->off_d - cur_d; + c3_y* bat_y = c3_malloc(byt_d); + + if ( !bat_y ) { + return; + } + + c3_zs ret_zs = pread(txt_u->fid_i, bat_y, byt_d, cur_d); + if ( ret_zs != (c3_zs)byt_d ) { + c3_free(bat_y); + return; + } + + c3_w sum_w = (c3_w)crc32(0, bat_y, byt_d); + c3_free(bat_y); + + if ( sum_w == txt_u->hed_u.sum_w ) { + return; // checksum valid + } + + // batch is corrupt — roll back + u3l_log("book: batch checksum mismatch, rolling back\r\n"); + + // count valid events before the corrupt batch + c3_d pre_d = 0; + c3_d pos_d = BOOK_DEED_BASE; + + while ( pos_d < cur_d ) { + c3_d len_d; + ret_zs = pread(txt_u->fid_i, &len_d, sizeof(c3_d), pos_d); + if ( ret_zs != sizeof(c3_d) || 0 == len_d ) { + break; + } + + c3_d siz_d = _book_deed_size(len_d); + if ( pos_d + siz_d > cur_d ) { + break; + } + + pos_d += siz_d; + pre_d++; + } + + c3_d las_d = ( pre_d > 0 ) + ? txt_u->hed_u.fir_d + pre_d + : txt_u->hed_u.fir_d; + + // truncate and update state + if ( -1 != ftruncate(txt_u->fid_i, cur_d) ) { + c3_sync(txt_u->fid_i); + } + + txt_u->off_d = cur_d; + txt_u->las_d = las_d; + txt_u->hed_u.las_d = las_d; + txt_u->hed_u.sum_w = 0; + txt_u->hed_u.bat_w = 0; + _book_save_head(txt_u); +} + +/* _book_pull_epoc(): parse epoch number from directory path. +** +** expects path ending in "0iN" where N is the epoch number. 
+** +** returns: c3y on success with *epo_d set, c3n on failure +*/ +static c3_o +_book_pull_epoc(const c3_c* pax_c, c3_d* epo_d) +{ + const c3_c* las_c = strrchr(pax_c, '/'); + las_c = las_c ? las_c + 1 : pax_c; + + // expect "0iN" format + if ( strncmp(las_c, "0i", 2) != 0 || !las_c[2] ) { + fprintf(stderr, "book: init must be called with epoch directory\r\n"); + return c3n; + } + + errno = 0; + *epo_d = strtoull(las_c + 2, NULL, 10); + if ( errno == EINVAL ) { + fprintf(stderr, "book: invalid epoch number in path\r\n"); + return c3n; + } + + return c3y; +} + +/* u3_book_init(): open/create event log in epoch directory. +*/ +u3_book* +u3_book_init(const c3_c* pax_c) +{ + c3_c log_c[8193]; + c3_i met_i, fid_i = -1; + struct stat buf_u; + u3_book* txt_u = 0; + + snprintf(log_c, sizeof(log_c), "%s/book.log", pax_c); + + fid_i = c3_open(log_c, O_RDWR | O_CREAT, 0644); + if ( 0 > fid_i ) { + u3l_log("book: failed to open %s: %s\r\n", log_c, strerror(errno)); + return 0; + } + + met_i = _book_init_meta_file(pax_c); + if ( 0 > met_i ) { + u3l_log("book: failed to open meta.bin\r\n"); + goto fail1; + } + + if ( 0 > fstat(fid_i, &buf_u) ) { + u3l_log("book: fstat failed: %s\r\n", strerror(errno)); + goto fail2; + } + + txt_u = c3_calloc(sizeof(u3_book)); + txt_u->fid_i = fid_i; + txt_u->met_i = met_i; + txt_u->pax_c = c3_malloc(strlen(log_c) + 1); + if ( !txt_u->pax_c ) { + goto fail3; + } + strcpy(txt_u->pax_c, log_c); + + if ( buf_u.st_size == 0 ) { + // extract epoch number from path + c3_d epo_d; + if ( c3n == _book_pull_epoc(pax_c, &epo_d) ) { + goto fail3; + } + + // set epoch fields before writing header + txt_u->hed_u.fir_d = epo_d; + txt_u->hed_u.las_d = epo_d; + + // new file: initialize and write header + if ( c3n == _book_make_head(txt_u) ) { + goto fail4; + } + + txt_u->las_d = epo_d; + txt_u->off_d = BOOK_DEED_BASE; + } + else if ( buf_u.st_size < (off_t)BOOK_DEED_BASE ) { + // corrupt file: too small for headers + u3l_log("book: file too small: %lld 
bytes\r\n", (long long)buf_u.st_size); + goto fail4; + } + else { + // existing file: read and validate header + if ( c3n == _book_read_head(txt_u) ) { + goto fail4; + } + + // try fast reverse scan first + if ( c3n == _book_scan_back(txt_u, &txt_u->off_d) ) { + // fall back to forward scan for recovery + _book_scan_fore(txt_u, &txt_u->off_d); + } + + // verify latest batch integrity (catches content corruption + // that structural checks miss, e.g. header flushed but deeds not) + _book_check_batch(txt_u); + + // fir_d pre-initialized but no events found: set las_d to match + if ( txt_u->hed_u.fir_d && !txt_u->las_d ) { + txt_u->las_d = txt_u->hed_u.fir_d; + } + } + + return txt_u; + +fail4: + c3_free(txt_u->pax_c); +fail3: + c3_free(txt_u); +fail2: + close(met_i); +fail1: + close(fid_i); + return 0; +} + +/* u3_book_exit(): close event log and release resources. +*/ +void +u3_book_exit(u3_book* txt_u) +{ + if ( !txt_u ) { + return; + } + + close(txt_u->fid_i); + + if ( 0 <= txt_u->met_i ) { + close(txt_u->met_i); + } + + c3_free(txt_u->pax_c); + c3_free(txt_u); +} + +/* u3_book_gulf(): read first and last event numbers from log. +*/ +c3_o +u3_book_gulf(u3_book* txt_u, c3_d* low_d, c3_d* hig_d) +{ + if ( !txt_u ) { + return c3n; + } + + *low_d = txt_u->hed_u.fir_d; + *hig_d = txt_u->las_d; + + return c3y; +} + +void +u3_book_stat(const c3_c* log_c) +{ + c3_i fid_i; + u3_book_head hed_a, hed_b, hed_u; + c3_o val_a, val_b; + struct stat buf_u; + + fid_i = c3_open(log_c, O_RDONLY, 0); + if ( fid_i < 0 ) { + fprintf(stderr, "book: failed to open %s: %s\r\n", log_c, strerror(errno)); + return; + } + + c3_zs ret_zs; + ret_zs = pread(fid_i, &hed_a, sizeof(u3_book_head), BOOK_HEAD_A); + val_a = (ret_zs == sizeof(u3_book_head)) ? _book_head_okay(&hed_a) : c3n; + + ret_zs = pread(fid_i, &hed_b, sizeof(u3_book_head), BOOK_HEAD_B); + val_b = (ret_zs == sizeof(u3_book_head)) ? 
_book_head_okay(&hed_b) : c3n; + + if ( c3n == _book_take_head(&hed_a, val_a, &hed_b, val_b, &hed_u, 0) ) { + fprintf(stderr, "book: no valid header found\r\n"); + close(fid_i); + return; + } + + if ( fstat(fid_i, &buf_u) < 0 ) { + fprintf(stderr, "book: fstat failed\r\n"); + close(fid_i); + return; + } + + fprintf(stderr, "book info:\r\n"); + fprintf(stderr, " file: %s\r\n", log_c); + fprintf(stderr, " format: %u\r\n", hed_u.ver_w); + fprintf(stderr, " first event: %" PRIu64 "\r\n", hed_u.fir_d); + fprintf(stderr, " last event: %" PRIu64 "\r\n", hed_u.las_d); + fprintf(stderr, " sequence: %" PRIu64 "\r\n", hed_u.seq_d); + fprintf(stderr, " file size: %lld bytes\r\n", (long long)buf_u.st_size); + + u3_book_meta met_u; + c3_c* epo_c = 0; + { + const c3_c* sep_c = strrchr(log_c, '/'); + if ( sep_c && 0 == strcmp(sep_c, "/book.log") ) { + c3_z len_z = sep_c - log_c; + epo_c = c3_malloc(len_z + 1); + if ( epo_c ) { + memcpy(epo_c, log_c, len_z); + epo_c[len_z] = '\0'; + } + } + } + c3_c* met_c = epo_c ? _book_meta_path(epo_c) : 0; + c3_free(epo_c); + c3_i met_i = c3_open(met_c, O_RDONLY, 0); + + if ( met_i >= 0 ) { + c3_zs ret_zs = pread(met_i, &met_u, sizeof(u3_book_meta), 0); + if ( ret_zs == sizeof(u3_book_meta) ) { + fprintf(stderr, "\r\ndisk metadata:\r\n"); + fprintf(stderr, " who: %s\r\n", u3_ship_to_string(met_u.who_d)); + fprintf(stderr, " version: %u\r\n", met_u.ver_w); + fprintf(stderr, " fake: %s\r\n", _(met_u.fak_o) ? "yes" : "no"); + fprintf(stderr, " life: %u\r\n", met_u.lif_w); + } + close(met_i); + } + c3_free(met_c); + + close(fid_i); +} + +/* u3_book_save(): save [len_d] events starting at [eve_d]. +** +** uses double-buffered headers for single-fsync commits: +** 1. write deed data +** 2. write updated header to INACTIVE slot +** 3. 
single fsync makes both durable atomically +*/ + +static_assert(sizeof(c3_d) == sizeof(c3_z)); + +c3_o +u3_book_save(u3_book* txt_u, + c3_d eve_d, // first event + c3_d len_d, // number of events + void** byt_p, // array of bytes + c3_z* siz_i, // array of lengths + c3_d epo_d) // target epoch +{ + if ( !txt_u ) { + return c3n; + } + + // validate contiguity + if ( 0 == txt_u->hed_u.fir_d && 0 == txt_u->las_d ) { + // empty log: first event must be the first event in the epoch + if ( epo_d + 1 != eve_d ) { + fprintf(stderr, "book: first event must be start of epoch, " + "expected %" PRIu64 ", got %" PRIu64 + "\r\n", epo_d + 1, eve_d); + return c3n; + } + // fir_d is the epoch base (last event before this epoch) + txt_u->hed_u.fir_d = epo_d; + } + else { + // non-empty: must be contiguous + if ( eve_d != txt_u->las_d + 1 ) { + fprintf(stderr, "book: event gap: expected %" PRIu64 ", got %" PRIu64 "\r\n", + txt_u->las_d + 1, eve_d); + return c3n; + } + } + + // batch write all deeds using scatter-gather I/O + // + // for each deed we need 3 iovec entries: len_d + buffer + let_d + // pwritev has IOV_MAX limit (typically 1024), so we chunk if needed + // + #ifdef IOV_MAX + const c3_d max_ded_d = IOV_MAX / 3; + #else + const c3_d max_ded_d = 1020 / 3; + #endif + + struct iovec iov_u[max_ded_d * 3]; + c3_d now_d = txt_u->off_d; + c3_d dun_d = 0; + c3_w chk_w = (c3_w)crc32(0, Z_NULL, 0); + + while ( dun_d < len_d ) { + c3_d cun_d = c3_min(len_d - dun_d, max_ded_d); + + c3_z cun_z = 0; + for ( c3_d i_d = 0; i_d < cun_d; i_d++ ) { + c3_d src_d = dun_d + i_d; + c3_d idx_d = i_d * 3; + c3_y* buf_y = (c3_y*)byt_p[src_d]; + + iov_u[idx_d + 0].iov_base = &siz_i[src_d]; + iov_u[idx_d + 0].iov_len = sizeof(c3_d); + iov_u[idx_d + 1].iov_base = buf_y; + iov_u[idx_d + 1].iov_len = siz_i[src_d]; + iov_u[idx_d + 2].iov_base = &siz_i[src_d]; + iov_u[idx_d + 2].iov_len = sizeof(c3_d); + + chk_w = (c3_w)crc32(chk_w, (const c3_y*)&siz_i[src_d], sizeof(c3_d)); + chk_w = (c3_w)crc32(chk_w, 
buf_y, siz_i[src_d]); + chk_w = (c3_w)crc32(chk_w, (const c3_y*)&siz_i[src_d], sizeof(c3_d)); + + cun_z += sizeof(c3_d) + siz_i[src_d] + sizeof(c3_d); + } + + c3_zs ret_zs = pwritev(txt_u->fid_i, iov_u, cun_d * 3, now_d); + + if ( ret_zs != (c3_zs)cun_z ) { + fprintf(stderr, "book: batch write failed: wrote %zd of %zu bytes: %s\r\n", + ret_zs, cun_z, strerror(errno)); + return c3n; + } + + now_d += cun_z; + dun_d += cun_d; + } + + c3_d new_las_d = eve_d + len_d - 1; + txt_u->hed_u.las_d = new_las_d; + txt_u->hed_u.sum_w = chk_w; + txt_u->hed_u.bat_w = len_d; + + // commit header: write to inactive slot, fsync, swap active + if ( c3n == _book_save_head(txt_u) ) { + return c3n; + } + + txt_u->las_d = new_las_d; + txt_u->off_d = now_d; + + return c3y; +} + +/* u3_book_read(): read events from log, invoking callback for each event. +*/ +c3_o +u3_book_read(u3_book* txt_u, + void* ptr_v, + c3_d eve_d, + c3_d len_d, + c3_o (*read_f)(void*, c3_d, c3_z, void*)) +{ + c3_d off_d; + c3_d cur_d; + + if ( !txt_u ) { + return c3n; + } + + if ( 0 == txt_u->las_d ) { + fprintf(stderr, "book: read from empty log\r\n"); + return c3n; + } + + // NB: fir_d is the epoch base; first stored event is fir_d + 1 + if ( eve_d <= txt_u->hed_u.fir_d || eve_d > txt_u->las_d ) { + fprintf(stderr, "book: event %" PRIu64 " out of range (%" PRIu64 ", %" PRIu64 "]\r\n", + eve_d, txt_u->hed_u.fir_d, txt_u->las_d); + return c3n; + } + + if ( eve_d + len_d - 1 > txt_u->las_d ) { + fprintf(stderr, "book: read range exceeds last event\r\n"); + return c3n; + } + + // NB: fir_d is the epoch base; first deed is event fir_d + 1 + off_d = BOOK_DEED_BASE; + cur_d = txt_u->hed_u.fir_d + 1; + + while ( cur_d < eve_d ) { + if ( c3n == _book_skip_deed(txt_u->fid_i, &off_d) ) { + fprintf(stderr, "book: failed to scan to event %" PRIu64 "\r\n", eve_d); + return c3n; + } + cur_d++; + } + + for ( c3_d i_d = 0; i_d < len_d; i_d++, cur_d++ ) { + u3_book_reed red_u; + c3_y* buf_y; + c3_z len_z; + + if ( c3n == 
_book_read_deed(txt_u->fid_i, &off_d, &red_u) ) { + fprintf(stderr, "book: failed to read event %" PRIu64 "\r\n", cur_d); + return c3n; + } + + if ( 0 == red_u.len_d ) { + fprintf(stderr, "book: validation failed at event %" PRIu64 "\r\n", cur_d); + c3_free(red_u.buf_y); + return c3n; + } + + len_z = red_u.len_d; + buf_y = red_u.buf_y; + + if ( c3n == read_f(ptr_v, cur_d, len_z, buf_y) ) { + c3_free(buf_y); + return c3n; + } + + c3_free(buf_y); + } + + return c3y; +} + +void +u3_book_read_meta(u3_book* txt_u, + void* ptr_v, + const c3_c* key_c, + void (*read_f)(void*, c3_zs, void*)) +{ + u3_book_meta met_u; + + if ( !txt_u ) { + read_f(ptr_v, -1, 0); + return; + } + + if ( c3n == _book_read_meta_file(txt_u->met_i, &met_u) ) { + u3l_log("book: read_meta: failed to read metadata\r\n"); + read_f(ptr_v, -1, 0); + return; + } + + if ( 0 == strcmp(key_c, "version") ) { + read_f(ptr_v, sizeof(c3_w), &met_u.ver_w); + } + else if ( 0 == strcmp(key_c, "who") ) { + read_f(ptr_v, sizeof(c3_d[2]), met_u.who_d); + } + else if ( 0 == strcmp(key_c, "fake") ) { + read_f(ptr_v, sizeof(c3_o), &met_u.fak_o); + } + else if ( 0 == strcmp(key_c, "life") ) { + read_f(ptr_v, sizeof(c3_w), &met_u.lif_w); + } + else { + read_f(ptr_v, -1, 0); + } +} + +/* u3_book_save_meta(): write fixed metadata section. 
+*/
+c3_o
+u3_book_save_meta(u3_book*    txt_u,
+                  const c3_c* key_c,
+                  c3_z        val_z,
+                  void*       val_p)
+{
+  u3_book_meta met_u;
+  void*        dst_p;  //  field selected by [key_c]
+  c3_z         wan_z;  //  exact size that field requires
+
+  if ( !txt_u ) {
+    return c3n;
+  }
+
+  //  read-modify-write: load the current record first
+  //
+  if ( c3n == _book_read_meta_file(txt_u->met_i, &met_u) ) {
+    u3l_log("book: save_meta: failed to read current metadata\r\n");
+    return c3n;
+  }
+
+  //  map the key onto its field; unknown keys are rejected
+  //
+  if ( 0 == strcmp(key_c, "version") ) {
+    dst_p = &met_u.ver_w;
+    wan_z = sizeof(c3_w);
+  }
+  else if ( 0 == strcmp(key_c, "who") ) {
+    dst_p = met_u.who_d;
+    wan_z = sizeof(c3_d[2]);
+  }
+  else if ( 0 == strcmp(key_c, "fake") ) {
+    dst_p = &met_u.fak_o;
+    wan_z = sizeof(c3_o);
+  }
+  else if ( 0 == strcmp(key_c, "life") ) {
+    dst_p = &met_u.lif_w;
+    wan_z = sizeof(c3_w);
+  }
+  else {
+    return c3n;
+  }
+
+  //  the caller's value must match the field width exactly
+  //
+  if ( val_z != wan_z ) {
+    return c3n;
+  }
+  memcpy(dst_p, val_p, val_z);
+
+  if ( c3n == _book_save_meta_file(txt_u->met_i, &met_u) ) {
+    u3l_log("book: save_meta: failed to write metadata\r\n");
+    return c3n;
+  }
+
+  return c3y;
+}
+
+/* u3_book_walk_init(): initialize event iterator.
+*/ +c3_o +u3_book_walk_init(u3_book* txt_u, + u3_book_walk* itr_u, + c3_d nex_d, + c3_d las_d) +{ + c3_d off_d; + c3_d cur_d; + + if ( !txt_u || !itr_u ) { + return c3n; + } + + if ( 0 == txt_u->las_d ) { + fprintf(stderr, "book: walk_init on empty log\r\n"); + return c3n; + } + + // NB: fir_d is the epoch base; first stored event is fir_d + 1 + if ( nex_d <= txt_u->hed_u.fir_d || nex_d > txt_u->las_d ) { + fprintf(stderr, "book: walk_init start %" PRIu64 " out of range (%" PRIu64 ", %" PRIu64 "]\r\n", + nex_d, txt_u->hed_u.fir_d, txt_u->las_d); + return c3n; + } + + if ( las_d < nex_d || las_d > txt_u->las_d ) { + fprintf(stderr, "book: walk_init end %" PRIu64 " out of range [%" PRIu64 ", %" PRIu64 "]\r\n", + las_d, nex_d, txt_u->las_d); + return c3n; + } + + // NB: fir_d is the epoch base; first deed is event fir_d + 1 + off_d = BOOK_DEED_BASE; + cur_d = txt_u->hed_u.fir_d + 1; + + while ( cur_d < nex_d ) { + if ( c3n == _book_skip_deed(txt_u->fid_i, &off_d) ) { + fprintf(stderr, "book: walk_init failed to scan to event %" PRIu64 "\r\n", nex_d); + return c3n; + } + cur_d++; + } + + itr_u->fid_i = txt_u->fid_i; + itr_u->nex_d = nex_d; + itr_u->las_d = las_d; + itr_u->off_d = off_d; + itr_u->liv_o = c3y; + + return c3y; +} + +/* u3_book_walk_next(): read next event from iterator. +** +** allocates buffer for event (caller must free). +** returns c3n when no more events or error. 
+*/ +c3_o +u3_book_walk_next(u3_book_walk* itr_u, c3_z* len_z, void** buf_v) +{ + u3_book_reed red_u; + c3_y* buf_y; + + if ( !itr_u || c3n == itr_u->liv_o ) { + return c3n; + } + + if ( itr_u->nex_d > itr_u->las_d ) { + itr_u->liv_o = c3n; + return c3n; + } + + if ( c3n == _book_read_deed(itr_u->fid_i, &itr_u->off_d, &red_u) ) { + fprintf(stderr, "book: walk_next failed to read event %" PRIu64 "\r\n", + itr_u->nex_d); + itr_u->liv_o = c3n; + return c3n; + } + + if ( 0 == red_u.len_d ) { + fprintf(stderr, "book: walk_next validation failed at event %" PRIu64 "\r\n", + itr_u->nex_d); + c3_free(red_u.buf_y); + itr_u->liv_o = c3n; + return c3n; + } + + *len_z = red_u.len_d; + buf_y = red_u.buf_y; + + *buf_v = buf_y; + itr_u->nex_d++; + + return c3y; +} + +/* u3_book_walk_done(): close iterator. +*/ +void +u3_book_walk_done(u3_book_walk* itr_u) +{ + if ( !itr_u ) { + return; + } + + itr_u->liv_o = c3n; + itr_u->fid_i = -1; +} diff --git a/pkg/vere/db/book.h b/pkg/vere/db/book.h new file mode 100644 index 0000000000..6c6b904e12 --- /dev/null +++ b/pkg/vere/db/book.h @@ -0,0 +1,173 @@ +/// @file + +#ifndef U3_VERE_DB_BOOK_H +#define U3_VERE_DB_BOOK_H + +#include "c3/c3.h" + + /* book: mostly append-only event log + ** + ** uses double-buffered headers for single-fsync commits (like LMDB) + ** two header slots alternate; the one with higher valid seq_d is current + */ + /* u3_book_head: on-disk file header (48 bytes, page-aligned slots) + ** + ** fir_d is write-once (set on first event save) + ** las_d is updated after each batch of events is committed + ** seq_d is monotonically increasing; determines which slot is current + ** bat_w is the number of deeds in the latest batch written + ** sum_w is CRC32 of the latest batch of deeds (for integrity check) + ** crc_w is CRC32 of preceding fields to detect partial writes + ** + ** two header slots at offsets 0 and 4096; deeds start at 8192 + */ + typedef struct _u3_book_head { + c3_w mag_w; // magic number: 0x424f4f4b 
("BOOK")
+    c3_w ver_w;  //  format version: 1
+    c3_d fir_d;  //  epoch base: first stored event is fir_d + 1
+    c3_d las_d;  //  last event number (commit marker)
+    c3_d seq_d;  //  sequence number (for double-buffer)
+    c3_d bat_w;  //  number of deeds in latest batch (NB: c3_d despite _w)
+    c3_w sum_w;  //  CRC32 of latest deed batch data
+    c3_w crc_w;  //  CRC32 checksum (of preceding fields)
+  } u3_book_head;
+
+  /* u3_book_meta: on-disk metadata format (fixed 256 bytes)
+  **
+  **   layout (in struct declaration order):
+  **     [16 bytes]  who_d (c3_d[2], identity)
+  **     [4 bytes]   ver_w (metadata format version)
+  **     [4 bytes]   lif_w (lifecycle length)
+  **     [1 byte]    fak_o (fake security bit)
+  **     [231 bytes] reserved for future use
+  **
+  **   total: 256 bytes
+  **
+  **   NOTE(review): sizes assume c3_o is one byte and that the compiler
+  **   inserts no padding between these members — confirm
+  */
+  typedef struct _u3_book_meta {
+    c3_d who_d[2];  //  ship identity (16 bytes)
+    c3_w ver_w;  //  metadata format version
+    c3_w lif_w;  //  lifecycle length (4 bytes)
+    c3_o fak_o;  //  fake security flag (1 byte)
+    c3_y pad_y[231];  //  reserved (231 bytes)
+  } u3_book_meta;
+
+  /* u3_book: event log handle
+  **
+  **   allocated by u3_book_init(), released by u3_book_exit().
+  */
+  typedef struct _u3_book {
+    c3_i fid_i;  //  file descriptor for book.log
+    c3_i met_i;  //  file descriptor for meta.bin
+    c3_c* pax_c;  //  file path to book.log
+    u3_book_head hed_u;  //  cached header (current valid state)
+    c3_d las_d;  //  cached last event number
+    c3_d off_d;  //  cached append offset (end of last event)
+    c3_w act_w;  //  active header slot a or b (0 or 1)
+  } u3_book;
+
+  /* u3_book_walk: event iterator
+  **
+  **   filled in by u3_book_walk_init(); fid_i is copied from the log
+  **   handle and reset to -1 by u3_book_walk_done().
+  */
+  typedef struct _u3_book_walk {
+    c3_i fid_i;  //  file descriptor
+    c3_d nex_d;  //  next event number to read
+    c3_d las_d;  //  last event number, inclusive
+    c3_d off_d;  //  current file offset
+    c3_o liv_o;  //  iterator valid
+  } u3_book_walk;
+
+  /* u3_book_deed: on-disk event record
+  **
+  **   on-disk format: len_d | buffer_data | let_d
+  **   where buffer_data is len_d bytes of opaque buffer data
+  **   and let_d echoes len_d for validation (used for backward scanning)
+  **
+  **   NB: not used directly for I/O due to variable-length buffer data
+  */
+  typedef struct _u3_book_deed {
+    c3_d len_d;  //  
buffer size (bytes)
+    // c3_y buf_y[];  // variable-length buffer data
+    c3_d let_d;  //  length trailer (echoes len_d, used for backward scanning)
+  } u3_book_deed;
+
+  /* u3_book_reed: in-memory event record representation for I/O
+  **
+  **   represents a complete event buffer including any prefixes.
+  **   the book API treats buffers as opaque byte arrays.
+  */
+  typedef struct _u3_book_reed {
+    c3_d len_d;  //  total buffer size (bytes)
+    c3_y* buf_y;  //  complete buffer (caller owns)
+  } u3_book_reed;
+
+  /* u3_book_init(): open/create event log at [pax_c].
+  **
+  **   [pax_c] is the epoch directory; the log lives at [pax_c]/book.log.
+  **   when the log is new, the directory name must be "0iN" (N = epoch).
+  **   returns 0 on failure.
+  */
+    u3_book*
+    u3_book_init(const c3_c* pax_c);
+
+  /* u3_book_exit(): close event log.
+  */
+    void
+    u3_book_exit(u3_book* txt_u);
+
+  /* u3_book_stat(): print book stats.
+  **
+  **   [pax_c] is the path of the book.log file itself (not its directory).
+  */
+    void
+    u3_book_stat(const c3_c* pax_c);
+
+  /* u3_book_gulf(): read first and last event numbers.
+  **
+  **   [low_d] is the header's fir_d (epoch base), [hig_d] the last event.
+  */
+    c3_o
+    u3_book_gulf(u3_book* txt_u, c3_d* low_d, c3_d* hig_d);
+
+  /* u3_book_read(): read [len_d] events starting at [eve_d].
+  **
+  **   [read_f] is called per event as (ptr_v, event number, length, buffer);
+  **   the buffer is freed after the callback returns.
+  */
+    c3_o
+    u3_book_read(u3_book* txt_u,
+                 void* ptr_v,
+                 c3_d eve_d,
+                 c3_d len_d,
+                 c3_o (*read_f)(void*, c3_d, c3_z, void*));
+
+  /* u3_book_save(): save [len_d] events starting at [eve_d].
+  **
+  **   [byt_p] and [siz_i] are parallel arrays of buffers and lengths;
+  **   [epo_d] is the epoch base, checked when the log is empty.
+  */
+    c3_o
+    u3_book_save(u3_book* txt_u,
+                 c3_d eve_d,
+                 c3_d len_d,
+                 void** byt_p,
+                 c3_z* siz_i,
+                 c3_d epo_d);
+
+  /* u3_book_read_meta(): read fixed metadata section.
+  **
+  **   [key_c] is "version", "who", "fake" or "life"; [read_f] receives
+  **   (ptr_v, size, value), or (ptr_v, -1, 0) on failure.
+  */
+    void
+    u3_book_read_meta(u3_book* txt_u,
+                      void* ptr_v,
+                      const c3_c* key_c,
+                      void (*read_f)(void*, c3_zs, void*));
+
+  /* u3_book_save_meta(): write fixed metadata section.
+  **
+  **   [val_z] must equal the exact size of the field named by [key_c].
+  */
+    c3_o
+    u3_book_save_meta(u3_book* txt_u,
+                      const c3_c* key_c,
+                      c3_z val_z,
+                      void* val_p);
+
+  /* u3_book_walk_init(): initialize event iterator.
+  **
+  **   iterates events [nex_d, las_d] inclusive.
+  */
+    c3_o
+    u3_book_walk_init(u3_book* txt_u,
+                      u3_book_walk* itr_u,
+                      c3_d nex_d,
+                      c3_d las_d);
+
+  /* u3_book_walk_next(): read next event from iterator.
+  **
+  **   on c3y, [*buf_v] is a freshly allocated buffer the caller must free.
+  */
+    c3_o
+    u3_book_walk_next(u3_book_walk* itr_u, c3_z* len_z, void** buf_v);
+
+  /* u3_book_walk_done(): close iterator.
+ */ + void + u3_book_walk_done(u3_book_walk* itr_u); + +#endif /* ifndef U3_VERE_DB_BOOK_H */ diff --git a/pkg/vere/disk.c b/pkg/vere/disk.c index 19a03b5372..7263f746d3 100644 --- a/pkg/vere/disk.c +++ b/pkg/vere/disk.c @@ -4,6 +4,7 @@ #include "events.h" #include "vere.h" #include "version.h" +#include "db/book.h" #include "db/lmdb.h" #include @@ -11,7 +12,7 @@ #include "v4.h" struct _u3_disk_walk { - u3_lmdb_walk itr_u; + u3_book_walk itr_u; u3_disk* log_u; c3_o liv_o; }; @@ -92,11 +93,12 @@ _disk_commit_cb(uv_work_t* ted_u) { u3_disk* log_u = ted_u->data; - log_u->sav_u.ret_o = u3_lmdb_save(log_u->mdb_u, + log_u->sav_u.ret_o = u3_book_save(log_u->txt_u, log_u->sav_u.eve_d, log_u->sav_u.len_w, (void**)log_u->sav_u.byt_y, - log_u->sav_u.siz_i); + log_u->sav_u.siz_i, + log_u->epo_d); } /* _disk_commit_start(): queue async event-batch write. @@ -273,11 +275,12 @@ u3_disk_sync(u3_disk* log_u) // XX max 100 // if ( c3y == _disk_batch(log_u) ) { - ret_o = u3_lmdb_save(log_u->mdb_u, + ret_o = u3_book_save(log_u->txt_u, log_u->sav_u.eve_d, log_u->sav_u.len_w, (void**)log_u->sav_u.byt_y, - log_u->sav_u.siz_i); + log_u->sav_u.siz_i, + log_u->epo_d); log_u->sav_u.ret_o = ret_o; @@ -373,7 +376,7 @@ u3_disk_read_list(u3_disk* log_u, c3_d eve_d, c3_d len_d, c3_l* mug_l) { struct _cd_list ven_u = { log_u, u3_nul, 0 }; - if ( c3n == u3_lmdb_read(log_u->mdb_u, &ven_u, + if ( c3n == u3_book_read(log_u->txt_u, &ven_u, eve_d, len_d, _disk_read_list_cb) ) { // XX test normal (not subcommand) replay with and without, @@ -397,7 +400,7 @@ u3_disk_walk_init(u3_disk* log_u, c3_d max_d = eve_d + len_d - 1; wok_u->log_u = log_u; - wok_u->liv_o = u3_lmdb_walk_init(log_u->mdb_u, + wok_u->liv_o = u3_book_walk_init(log_u->txt_u, &wok_u->itr_u, eve_d, c3_min(max_d, log_u->dun_d)); @@ -433,7 +436,7 @@ u3_disk_walk_step(u3_disk_walk* wok_u, u3_fact* tac_u) tac_u->eve_d = wok_u->itr_u.nex_d; - if ( c3n == u3_lmdb_walk_next(&wok_u->itr_u, &len_i, &buf_v) ) { + if ( c3n == 
u3_book_walk_next(&wok_u->itr_u, &len_i, &buf_v) ) { fprintf(stderr, "disk: (%" PRIu64 "): read fail\r\n", tac_u->eve_d); return wok_u->liv_o = c3n; } @@ -455,25 +458,11 @@ u3_disk_walk_step(u3_disk_walk* wok_u, u3_fact* tac_u) void u3_disk_walk_done(u3_disk_walk* wok_u) { - u3_lmdb_walk_done(&wok_u->itr_u); + u3_book_walk_done(&wok_u->itr_u); c3_free(wok_u); } -/* _disk_save_meta(): serialize atom, save as metadata at [key_c]. -*/ -static c3_o -_disk_save_meta(MDB_env* mdb_u, const c3_c* key_c, c3_w len_w, c3_y* byt_y) -{ - // strip trailing zeroes. - // - while ( len_w && !byt_y[len_w - 1] ) { - len_w--; - } - - return u3_lmdb_save_meta(mdb_u, key_c, len_w, byt_y); -} - -/* u3_disk_save_meta(): save metadata. +/* u3_disk_save_meta(): save metadata to lmdb. */ c3_o u3_disk_save_meta(MDB_env* mdb_u, const u3_meta* met_u) @@ -482,10 +471,10 @@ u3_disk_save_meta(MDB_env* mdb_u, const u3_meta* met_u) u3_noun who = u3i_chubs(2, met_u->who_d); - if ( (c3n == _disk_save_meta(mdb_u, "version", sizeof(c3_w), (c3_y*)&met_u->ver_w)) - || (c3n == _disk_save_meta(mdb_u, "who", 2 * sizeof(c3_d), (c3_y*)met_u->who_d)) - || (c3n == _disk_save_meta(mdb_u, "fake", sizeof(c3_o), (c3_y*)&met_u->fak_o)) - || (c3n == _disk_save_meta(mdb_u, "life", sizeof(c3_w), (c3_y*)&met_u->lif_w)) ) + if ( (c3n == u3_lmdb_save_meta(mdb_u, "version", sizeof(c3_w), (c3_y*)&met_u->ver_w)) + || (c3n == u3_lmdb_save_meta(mdb_u, "who", sizeof(met_u->who_d), (c3_y*)met_u->who_d)) + || (c3n == u3_lmdb_save_meta(mdb_u, "fake", sizeof(c3_o), (c3_y*)&met_u->fak_o)) + || (c3n == u3_lmdb_save_meta(mdb_u, "life", sizeof(c3_w), (c3_y*)&met_u->lif_w)) ) { u3z(who); return c3n; @@ -496,24 +485,25 @@ u3_disk_save_meta(MDB_env* mdb_u, const u3_meta* met_u) } -/* u3_disk_save_meta_meta(): save meta metadata. +/* u3_disk_save_meta_meta(): save meta metadata using lmdb. 
*/ c3_o u3_disk_save_meta_meta(c3_c* log_c, const u3_meta* met_u) { - MDB_env* dbm_u; + MDB_env* mdb_u; - if ( 0 == (dbm_u = u3_lmdb_init(log_c, u3_Host.ops_u.siz_i)) ) { - fprintf(stderr, "disk: failed to initialize meta-lmdb\r\n"); + if ( 0 == (mdb_u = u3_lmdb_init(log_c, 1ULL << 30)) ) { + fprintf(stderr, "disk: failed to initialize lmdb for metadata\r\n"); return c3n; } - if ( c3n == u3_disk_save_meta(dbm_u, met_u) ) { + if ( c3n == u3_disk_save_meta(mdb_u, met_u) ) { fprintf(stderr, "disk: failed to save metadata\r\n"); + u3_lmdb_exit(mdb_u); return c3n; } - u3_lmdb_exit(dbm_u); + u3_lmdb_exit(mdb_u); return c3y; } @@ -541,7 +531,7 @@ _disk_meta_read_cb(void* ptr_v, ssize_t val_i, void* val_v) } } -/* u3_disk_read_meta(): read metadata. +/* u3_disk_read_meta(): read metadata from lmdb. */ c3_o u3_disk_read_meta(MDB_env* mdb_u, u3_meta* met_u) @@ -654,8 +644,8 @@ u3_disk_read_meta(MDB_env* mdb_u, u3_meta* met_u) } } - // NB: we read metadata from LMDB even when met_u is null because sometimes - // because sometimes we call this just to ensure metadata exists + // NB: we read metadata from lmdb even when met_u is null because sometimes + // we call this just to ensure metadata exists if ( met_u ) { met_u->ver_w = ver_w; memcpy(met_u->who_d, who_d, 2 * sizeof(c3_d)); @@ -827,9 +817,19 @@ u3_disk_exit(u3_disk* log_u) return; } - // close database + // close lmdb metadata environment (if still open) // - u3_lmdb_exit(log_u->mdb_u); + if ( log_u->mdb_u ) { + u3_lmdb_exit(log_u->mdb_u); + log_u->mdb_u = 0; + } + + // close epoch event log (book) + // + if ( log_u->txt_u ) { + u3_book_exit(log_u->txt_u); + log_u->txt_u = 0; + } // dispose planned writes // @@ -1147,26 +1147,33 @@ _disk_epoc_roll(u3_disk* log_u, c3_d epo_d) } #endif - // get metadata from old log, update version + // get metadata from top-level lmdb, update version u3_meta old_u; if ( c3y != u3_disk_read_meta(log_u->mdb_u, &old_u) ) { fprintf(stderr, "disk: failed to read metadata\r\n"); goto fail3; } 
- u3_lmdb_exit(log_u->mdb_u); - log_u->mdb_u = 0; + + // close old epoch book if still open + if ( log_u->txt_u ) { + u3_book_exit(log_u->txt_u); + log_u->txt_u = 0; + } // initialize db of new epoch - if ( 0 == (log_u->mdb_u = u3_lmdb_init(epo_c, u3_Host.ops_u.siz_i)) ) { + if ( 0 == (log_u->txt_u = u3_book_init(epo_c)) ) { fprintf(stderr, "disk: failed to initialize database\r\n"); c3_free(log_u); goto fail3; } - // write the metadata to the database + // write the metadata to the epoch's book old_u.ver_w = U3D_VERLAT; - if ( c3n == u3_disk_save_meta(log_u->mdb_u, &old_u) ) { - fprintf(stderr, "disk: failed to save metadata\r\n"); + if ( c3n == u3_book_save_meta(log_u->txt_u, "version", sizeof(c3_w), (c3_y*)&old_u.ver_w) + || c3n == u3_book_save_meta(log_u->txt_u, "who", sizeof(old_u.who_d), (c3_y*)old_u.who_d) + || c3n == u3_book_save_meta(log_u->txt_u, "fake", sizeof(c3_o), (c3_y*)&old_u.fak_o) + || c3n == u3_book_save_meta(log_u->txt_u, "life", sizeof(c3_w), (c3_y*)&old_u.lif_w) ) { + fprintf(stderr, "disk: failed to save metadata to epoch\r\n"); goto fail3; } @@ -1339,17 +1346,17 @@ _disk_migrate_epoc(u3_disk* log_u, c3_d eve_d) * 6. open epoch lmdb and set it in log_u */ - // NB: requires that log_u->mdb_u is initialized to log/data.mdb + // NB: requires that log_u->txt_u is initialized to log/data.mdb // XX: put old log in separate pointer (old_u?)? 
- // get metadata from old log, update version + // get metadata from top-level lmdb, update version u3_meta olm_u; if ( c3y != u3_disk_read_meta(log_u->mdb_u, &olm_u) ) { fprintf(stderr, "disk: failed to read metadata\r\n"); return c3n; } - // finish with old log + // finish with old log lmdb (will be re-initialized for epoch) u3_lmdb_exit(log_u->mdb_u); log_u->mdb_u = 0; @@ -1418,22 +1425,25 @@ _disk_migrate_epoc(u3_disk* log_u, c3_d eve_d) return c3n; } - if ( 0 == (log_u->mdb_u = u3_lmdb_init(tmp_c, u3_Host.ops_u.siz_i)) ) { + if ( 0 == (log_u->txt_u = u3_book_init(tmp_c)) ) { fprintf(stderr, "disk: failed to initialize database at %s\r\n", tmp_c); return c3n; } olm_u.ver_w = U3D_VERLAT; - if ( c3n == u3_disk_save_meta(log_u->mdb_u, &olm_u) ) { - fprintf(stderr, "disk: failed to save metadata\r\n"); + if ( c3n == u3_book_save_meta(log_u->txt_u, "version", sizeof(c3_w), (c3_y*)&olm_u.ver_w) + || c3n == u3_book_save_meta(log_u->txt_u, "who", sizeof(olm_u.who_d), (c3_y*)olm_u.who_d) + || c3n == u3_book_save_meta(log_u->txt_u, "fake", sizeof(c3_o), (c3_y*)&olm_u.fak_o) + || c3n == u3_book_save_meta(log_u->txt_u, "life", sizeof(c3_w), (c3_y*)&olm_u.lif_w) ) { + fprintf(stderr, "disk: failed to save metadata to book\r\n"); return c3n; } // atomic truncation of old log // - u3_lmdb_exit(log_u->mdb_u); - log_u->mdb_u = 0; + u3_book_exit(log_u->txt_u); + log_u->txt_u = 0; c3_c trd_c[8193]; snprintf(trd_c, sizeof(trd_c), "%s/data.mdb", tmp_c); @@ -1453,7 +1463,7 @@ _disk_migrate_epoc(u3_disk* log_u, c3_d eve_d) strerror(errno)); } - if ( 0 == (log_u->mdb_u = u3_lmdb_init(epo_c, u3_Host.ops_u.siz_i)) ) { + if ( 0 == (log_u->txt_u = u3_book_init(epo_c)) ) { fprintf(stderr, "disk: failed to initialize database at %s\r\n", epo_c); return c3n; @@ -1529,7 +1539,7 @@ u3_disk_roll(u3_disk* log_u, c3_d eve_d) // XX get fir_d from log_u c3_d fir_d, las_d; - if ( c3n == u3_lmdb_gulf(log_u->mdb_u, &fir_d, &las_d) ) { + if ( c3n == u3_book_gulf(log_u->txt_u, &fir_d, &las_d) ) { 
fprintf(stderr, "roll: failed to read first/last event numbers\r\n"); exit(1); } @@ -1679,7 +1689,7 @@ static void _disk_migrate_old(u3_disk* log_u) { c3_d fir_d, las_d; - if ( c3n == u3_lmdb_gulf(log_u->mdb_u, &fir_d, &las_d) ) { + if ( c3n == u3_book_gulf(log_u->txt_u, &fir_d, &las_d) ) { fprintf(stderr, "disk: failed to get first/last event numbers\r\n"); exit(1); } @@ -1690,9 +1700,9 @@ _disk_migrate_old(u3_disk* log_u) case U3D_VER1: { _disk_migrate_loom(log_u->dir_u->pax_c, las_d); - // set version to 2 (migration in progress) + // set version to 2 (migration in progress) in top-level lmdb log_u->ver_w = U3D_VER2; - if ( c3n == _disk_save_meta(log_u->mdb_u, "version", 4, (c3_y*)&log_u->ver_w) ) { + if ( c3n == u3_lmdb_save_meta(log_u->mdb_u, "version", sizeof(c3_w), (c3_y*)&log_u->ver_w) ) { fprintf(stderr, "disk: failed to set version to 2\r\n"); exit(1); } @@ -1768,31 +1778,57 @@ _disk_epoc_load(u3_disk* log_u, c3_d lat_d, u3_disk_load_e lod_e) c3_c epo_c[8193]; snprintf(epo_c, 8192, "%s/0i%" PRIc3_d, log_u->com_u->pax_c, lat_d); - // initialize latest epoch's db - if ( 0 == (log_u->mdb_u = u3_lmdb_init(epo_c, u3_Host.ops_u.siz_i)) ) { - fprintf(stderr, "disk: failed to initialize database at %s\r\n", - epo_c); - return _epoc_fail; + // for U3E_VER1 and U3E_VER2 epochs, we need special handling + // both use lmdb format, but the new system uses book.log + // we read metadata from the old lmdb then trigger migration via rollover + c3_d fir_d, las_d; + + if ( U3E_VER2 >= ver_w ) { + // open with lmdb temporarily to get first and last events + MDB_env* mdb_u = u3_lmdb_init(epo_c, 1ULL << 30); + if ( 0 == mdb_u ) { + fprintf(stderr, "disk: failed to initialize lmdb at %s\r\n", epo_c); + return _epoc_fail; + } + + // get first/last event numbers from lmdb + if ( c3n == u3_lmdb_gulf(mdb_u, &fir_d, &las_d) ) { + fprintf(stderr, "disk: failed to get first/last event numbers\r\n"); + u3_lmdb_exit(mdb_u); + return _epoc_fail; + } + + u3_lmdb_exit(mdb_u); + + // store 
null for txt_u to indicate lmdb-format epoch (will need migration) + log_u->txt_u = 0; } + else { + // initialize latest epoch's db for U3E_VER3+ (book format) + if ( 0 == (log_u->txt_u = u3_book_init(epo_c)) ) { + fprintf(stderr, "disk: failed to initialize database at %s\r\n", + epo_c); + return _epoc_fail; + } - fprintf(stderr, "disk: loaded epoch 0i%" PRIc3_d "\r\n", lat_d); + fprintf(stderr, "disk: loaded epoch 0i%" PRIc3_d "\r\n", lat_d); - // get first/last event numbers from lmdb - c3_d fir_d, las_d; - if ( c3n == u3_lmdb_gulf(log_u->mdb_u, &fir_d, &las_d) ) { - fprintf(stderr, "disk: failed to get first/last event numbers\r\n"); - u3_lmdb_exit(log_u->mdb_u); - log_u->mdb_u = 0; - return _epoc_fail; + // get first/last event numbers from book + if ( c3n == u3_book_gulf(log_u->txt_u, &fir_d, &las_d) ) { + fprintf(stderr, "disk: failed to get first/last event numbers\r\n"); + u3_book_exit(log_u->txt_u); + log_u->txt_u = 0; + return _epoc_fail; + } } if ( (u3_dlod_boot != lod_e) && !fir_d - && !las_d - && (c3n == u3_disk_read_meta(log_u->mdb_u, 0)) ) + && !las_d ) { - u3_lmdb_exit(log_u->mdb_u); - log_u->mdb_u = 0; + // empty epoch (no events and no metadata) + u3_book_exit(log_u->txt_u); + log_u->txt_u = 0; return _epoc_void; } @@ -1809,6 +1845,12 @@ _disk_epoc_load(u3_disk* log_u, c3_d lat_d, u3_disk_load_e lod_e) // switch ( ver_w ) { case U3E_VER1: { + // migration from U3E_VER1 (lmdb with loom files) to U3E_VER3 (book.log) + // txt_u is null for U3E_VER1 since we can't keep lmdb epoch open + // we must perform loom migration and then rollover to new format epoch + // + fprintf(stderr, "disk: epoch v1 detected, migrating to v3...\r\n"); + if ( u3_dlod_epoc == lod_e ) { fprintf(stderr, "migration required, replay disallowed\r\n"); exit(1); @@ -1824,10 +1866,18 @@ _disk_epoc_load(u3_disk* log_u, c3_d lat_d, u3_disk_load_e lod_e) } _disk_unlink_stale_loom(log_u->dir_u->pax_c); + fprintf(stderr, "disk: epoch v3 migration done\r\n"); + return _epoc_good; } 
break; case U3E_VER2: { + // migration from U3E_VER2 (data.mdb) to U3E_VER3 (book.log) + // txt_u is null for U3E_VER2 since we can't keep lmdb epoch open + // we must trigger an immediate rollover to create the new format epoch + // + fprintf(stderr, "disk: epoch v2 detected, migrating to v3...\r\n"); + if ( u3_dlod_epoc == lod_e ) { c3_c chk_c[8193]; snprintf(chk_c, 8193, "%s/.urb/chk", log_u->dir_u->pax_c); @@ -1866,10 +1916,22 @@ _disk_epoc_load(u3_disk* log_u, c3_d lat_d, u3_disk_load_e lod_e) exit(1); } - if ( (u3C.wag_w & u3o_yolo) // XX better argument to disable autoroll - || (!log_u->epo_d && log_u->dun_d && !u3A->eve_d) - || (c3n == _disk_vere_diff(log_u)) ) + // for U3E_VER2, we always need to perform rollover + // this creates a new epoch in U3E_VER3 format while keeping the old one + if ( log_u->dun_d == u3A->eve_d ) { + fprintf(stderr, "disk: rolling over to new U3E_VER3 epoch\r\n"); + if ( c3n == _disk_epoc_roll(log_u, log_u->dun_d) ) { + fprintf(stderr, "disk: failed to roll over epoch\r\n"); + exit(1); + } + fprintf(stderr, "disk: epoch v3 migration done\r\n"); + return _epoc_good; + } + + if ( (u3C.wag_w & u3o_yolo) + || (!log_u->epo_d && log_u->dun_d && !u3A->eve_d) ) { + // ok to proceed without rollover in special cases return _epoc_good; } else if ( log_u->dun_d != u3A->eve_d ) { @@ -1882,6 +1944,28 @@ _disk_epoc_load(u3_disk* log_u, c3_d lat_d, u3_disk_load_e lod_e) fprintf(stderr, "disk: failed to initialize epoch\r\n"); exit(1); } + + fprintf(stderr, "disk: epoch v3 migration done\r\n"); + return _epoc_good; + } break; + + case U3E_VER3: { + u3m_boot(log_u->dir_u->pax_c, (size_t)1 << u3_Host.ops_u.lom_y); // XX confirm + + if ( log_u->dun_d < u3A->eve_d ) { + // XX bad, add to enum + fprintf(stderr, "mars: corrupt pier, snapshot (%" PRIu64 + ") from future (log=%" PRIu64 ")\r\n", + u3A->eve_d, log_u->dun_d); + exit(1); + } + else if ( u3A->eve_d < log_u->epo_d ) { + // XX goto full replay + fprintf(stderr, "mars: corrupt pier, snapshot (%" 
PRIu64 + ") out of epoch (%" PRIu64 ")\r\n", + u3A->eve_d, log_u->epo_d); + exit(1); + } return _epoc_good; } break; @@ -2039,11 +2123,11 @@ u3_disk_load(c3_c* pax_c, u3_disk_load_e lod_e) return log_u; } - // read metadata (version) from old log / top-level + // read metadata (version) from top-level lmdb // { u3_meta met_u; - if ( (0 == (log_u->mdb_u = u3_lmdb_init(log_c, u3_Host.ops_u.siz_i))) + if ( (0 == (log_u->mdb_u = u3_lmdb_init(log_c, 1ULL << 30))) || (c3n == u3_disk_read_meta(log_u->mdb_u, &met_u)) ) { fprintf(stderr, "disk: failed to read metadata\r\n"); @@ -2063,14 +2147,18 @@ u3_disk_load(c3_c* pax_c, u3_disk_load_e lod_e) fprintf(stderr, "migration required, replay disallowed\r\n"); exit(1); } - _disk_migrate_old(log_u); + // for old ships, also open the top-level lmdb file for metadata + if ( 0 == (log_u->mdb_u = u3_lmdb_init(log_c, 1ULL << 30)) ) { + fprintf(stderr, "disk: failed to open old book\r\n"); + c3_free(log_u); // XX leaks dire(s) + return 0; + } log_u->liv_o = c3y; return log_u; } - // close top-level lmdb - u3_lmdb_exit(log_u->mdb_u); - log_u->mdb_u = 0; + // keep top-level lmdb metadata environment open for later access + // (txt_u will be initialized for the epoch next) // get latest epoch number c3_d lat_d; @@ -2118,7 +2206,7 @@ u3_disk_load(c3_c* pax_c, u3_disk_load_e lod_e) return 0; } - fprintf(stderr, "disk: latest epoch is 0i%" PRIc3_d " is bogus; " + fprintf(stderr, "disk: latest epoch 0i%" PRIc3_d " is bogus; " "falling back to previous at 0i%" PRIc3_d "\r\n", lat_d, sot_d[1]); @@ -2144,4 +2232,6 @@ u3_disk_load(c3_c* pax_c, u3_disk_load_e lod_e) } } } + + return log_u; } diff --git a/pkg/vere/lmdb_tests.c b/pkg/vere/lmdb_tests.c new file mode 100644 index 0000000000..03778bc4dd --- /dev/null +++ b/pkg/vere/lmdb_tests.c @@ -0,0 +1,447 @@ +#include "db/lmdb.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#define _alloc(sz) malloc(sz) +#define _free(ptr) free(ptr) + +// default mmap size 
for lmdb (2GB)
+#define LMDB_MAP_SIZE (1ULL << 31)
+
+/* _test_make_tmpdir(): create unique temporary directory for lmdb.
+**
+**   creates /tmp/lmdb_test_XXXXXX via mkdtemp() and copies the
+**   resulting path to the heap.
+**
+**   returns: heap-allocated path (caller must free), or 0 if the
+**   directory could not be created or the copy could not be allocated
+*/
+static c3_c*
+_test_make_tmpdir(void)
+{
+  c3_c  pat_c[] = "/tmp/lmdb_test_XXXXXX";
+  c3_c* dir_c   = mkdtemp(pat_c);
+
+  if ( !dir_c ) {
+    fprintf(stderr, "lmdb_test: mkdtemp failed: %s\r\n", strerror(errno));
+    return 0;
+  }
+
+  // [pat_c] is a stack buffer, so hand the caller a heap copy
+  //
+  c3_c* ret_c = _alloc(strlen(dir_c) + 1);
+
+  if ( !ret_c ) {
+    fprintf(stderr, "lmdb_test: alloc failed\r\n");
+    return 0;
+  }
+
+  strcpy(ret_c, dir_c);
+  return ret_c;
+}
+
+/* _test_rm_rf(): recursively remove a directory and its contents.
+**
+**   expects path like /tmp/lmdb_test_XXXXXX
+**
+**   exits the process if [pax_c] is null or does not start with
+**   "/tmp/" (so neither /tmp itself nor a sibling such as /tmpfoo can
+**   be removed). removal is delegated to `rm -rf` through system();
+**   a failed or truncated command is reported but not fatal.
+*/
+static void
+_test_rm_rf(const c3_c* pax_c)
+{
+  if ( !pax_c || strncmp(pax_c, "/tmp/", 5) != 0 ) {
+    // passing a null pointer to %s is undefined, so substitute a marker
+    //
+    fprintf(stderr, "lmdb_test: refusing to remove non-/tmp path: %s\r\n",
+                    pax_c ? pax_c : "(null)");
+    exit(1);
+  }
+
+  c3_c cmd_c[8192];
+  c3_i len_i = snprintf(cmd_c, sizeof(cmd_c), "rm -rf %s", pax_c);
+
+  // never run a truncated command: it could name a different directory
+  //
+  if ( (len_i < 0) || ((size_t)len_i >= sizeof(cmd_c)) ) {
+    fprintf(stderr, "lmdb_test: path too long to remove: %s\r\n", pax_c);
+    return;
+  }
+
+  if ( 0 != system(cmd_c) ) {
+    fprintf(stderr, "lmdb_test: failed to remove %s\r\n", pax_c);
+  }
+}
+
+//==============================================================================
+// Benchmarks
+//==============================================================================
+
+/* _bench_make_event(): create a dummy event of specified size.
+**
+**   the first (up to) 4 bytes hold a simple hash of [eve_d]; every
+**   following byte at index i is (eve_d + i) & 0xFF.
+**
+**   returns: heap-allocated buffer of [siz_z] bytes (caller must free),
+**   or 0 if the allocation failed
+*/
+static c3_y*
+_bench_make_event(c3_z siz_z, c3_d eve_d)
+{
+  c3_y* buf_y = _alloc(siz_z);
+
+  if ( !buf_y ) {
+    return 0;
+  }
+
+  // mug: simple hash from event number. clamp the copy so an event
+  // shorter than the mug cannot overflow the buffer
+  //
+  c3_w mug_w = (c3_w)(eve_d * 0x12345678);
+  c3_z hed_z = ( siz_z < 4 ) ? siz_z : 4;
+  memcpy(buf_y, &mug_w, hed_z);
+
+  // fill remaining bytes with pattern
+  //
+  for ( c3_z i = 4; i < siz_z; i++ ) {
+    buf_y[i] = (c3_y)((eve_d + i) & 0xFF);
+  }
+
+  return buf_y;
+}
+
+/* _bench_get_time_ns(): get current time in nanoseconds.
+*/
+static c3_d
+_bench_get_time_ns(void)
+{
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  return (c3_d)ts.tv_sec * 1000000000ULL + (c3_d)ts.tv_nsec;
+}
+
+/* _bench_write_speed(): benchmark write performance.
+**
+**   writes [num_d] events of [siz_z] bytes each, one at a time, into a
+**   fresh lmdb environment in a temporary directory.
+**   reports total time, events/sec, MB/s, and per-event latency.
+**
+**   returns: 1 on success, 0 on failure. the environment and the
+**   temporary directory are released on every path after creation.
+*/
+static c3_i
+_bench_write_speed(c3_d num_d, c3_z siz_z)
+{
+  c3_c* dir_c = _test_make_tmpdir();
+  if ( !dir_c ) return 0;
+
+  c3_i     ret_i = 1;
+  c3_y*    evt_y = 0;
+  MDB_env* env_u = u3_lmdb_init(dir_c, LMDB_MAP_SIZE);
+
+  if ( !env_u ) {
+    fprintf(stderr, " write_speed: init failed\r\n");
+    ret_i = 0;
+    goto cleanup;
+  }
+
+  // pre-allocate event buffer (reuse for all writes)
+  //
+  evt_y = _bench_make_event(siz_z, 1);
+
+  if ( !evt_y ) {
+    fprintf(stderr, " write_speed: event alloc failed\r\n");
+    ret_i = 0;
+    goto cleanup_env;
+  }
+
+  // the per-event mug is 4 bytes; never stamp past a shorter buffer
+  //
+  c3_z hed_z = ( siz_z < 4 ) ? siz_z : 4;
+
+  // start timing
+  //
+  c3_d beg_d = _bench_get_time_ns();
+
+  // write events one at a time (single-event transactions)
+  //
+  for ( c3_d i = 0; i < num_d; i++ ) {
+    // update event data pattern for variety
+    //
+    c3_w mug_w = (c3_w)((i + 1) * 0x12345678);
+    memcpy(evt_y, &mug_w, hed_z);
+
+    void*  byt_p[1] = { evt_y };
+    size_t siz_i[1] = { siz_z };
+
+    c3_o sav_o = u3_lmdb_save(env_u, i + 1, 1, byt_p, siz_i);
+    if ( c3n == sav_o ) {
+      fprintf(stderr, " write_speed: save failed at event %" PRIu64 "\r\n", i + 1);
+      ret_i = 0;
+      // close the environment too, not just the buffer
+      //
+      goto cleanup_env;
+    }
+  }
+
+  // end timing
+  //
+  c3_d end_d = _bench_get_time_ns();
+  c3_d lap_d = end_d - beg_d;  // elapsed nanoseconds
+
+  // calculate metrics
+  //
+  double elapsed_sec = (double)lap_d / 1e9;
+  double events_per_sec = (double)num_d / elapsed_sec;
+  double total_bytes = (double)num_d * (double)siz_z;
+  double mb_per_sec = (total_bytes / (1024.0 * 1024.0)) / elapsed_sec;
+  double us_per_event = ((double)lap_d / 1000.0) / (double)num_d;
+
+  // report results
+  //
+  fprintf(stderr, "\r\n");
+  fprintf(stderr, " write_speed benchmark (single-event writes):\r\n");
+  fprintf(stderr, " events written: %" PRIu64 "\r\n", num_d);
+  fprintf(stderr, " event size: %" PRIu64 " bytes\r\n", (c3_d)siz_z);
+  fprintf(stderr, " total data: %.2f MB\r\n", total_bytes / (1024.0 * 1024.0));
+  fprintf(stderr, " total time: %.3f seconds\r\n", elapsed_sec);
+  fprintf(stderr, " write speed: %.0f events/sec\r\n", events_per_sec);
+  fprintf(stderr, " throughput: %.2f MB/sec\r\n", mb_per_sec);
+  fprintf(stderr, " latency: %.1f us/event\r\n", us_per_event);
+  fprintf(stderr, "\r\n");
+
+cleanup_env:
+  _free(evt_y);
+  u3_lmdb_exit(env_u);
+
+cleanup:
+  _test_rm_rf(dir_c);
+  _free(dir_c);
+
+  fprintf(stderr, " write_speed_benchmark: %s\r\n", ret_i ? "ok" : "FAILED");
+  return ret_i;
+}
+
+/* _bench_write_speed_batched(): benchmark batched write performance.
+**
+**   writes [num_d] events of [siz_z] bytes in batches of [bat_d]
+**   (the final batch may be shorter).
+**   reports total time, events/sec, MB/s, and per-event latency.
+**
+**   returns: 1 on success, 0 on failure. [bat_d] must be nonzero,
+**   since a zero batch size could never make progress.
+*/
+static c3_i
+_bench_write_speed_batched(c3_d num_d, c3_z siz_z, c3_d bat_d)
+{
+  // a zero batch size would never advance the write loop below
+  //
+  if ( 0 == bat_d ) {
+    fprintf(stderr, " write_speed_batched: batch size must be nonzero\r\n");
+    return 0;
+  }
+
+  c3_c* dir_c = _test_make_tmpdir();
+  if ( !dir_c ) return 0;
+
+  c3_i     ret_i = 1;
+  c3_d     mad_d = 0;  // event buffers allocated so far
+  c3_y**   evt_y = 0;
+  void**   byt_p = 0;
+  size_t*  siz_i = 0;
+  MDB_env* env_u = u3_lmdb_init(dir_c, LMDB_MAP_SIZE);
+
+  if ( !env_u ) {
+    fprintf(stderr, " write_speed_batched: init failed\r\n");
+    ret_i = 0;
+    goto cleanup;
+  }
+
+  // allocate batch arrays
+  //
+  evt_y = _alloc(bat_d * sizeof(c3_y*));
+  byt_p = _alloc(bat_d * sizeof(void*));
+  siz_i = _alloc(bat_d * sizeof(size_t));
+
+  if ( !evt_y || !byt_p || !siz_i ) {
+    fprintf(stderr, " write_speed_batched: alloc failed\r\n");
+    ret_i = 0;
+    goto cleanup_buffers;
+  }
+
+  // pre-allocate event buffers for batch; [mad_d] tracks how many
+  // exist so that cleanup frees exactly those
+  //
+  for ( ; mad_d < bat_d; mad_d++ ) {
+    evt_y[mad_d] = _bench_make_event(siz_z, mad_d + 1);
+
+    if ( !evt_y[mad_d] ) {
+      fprintf(stderr, " write_speed_batched: alloc failed\r\n");
+      ret_i = 0;
+      goto cleanup_buffers;
+    }
+
+    byt_p[mad_d] = evt_y[mad_d];
+    siz_i[mad_d] = siz_z;
+  }
+
+  // the per-event mug is 4 bytes; never stamp past a shorter buffer
+  //
+  c3_z hed_z = ( siz_z < 4 ) ? siz_z : 4;
+
+  // start timing
+  //
+  c3_d start_d = _bench_get_time_ns();
+
+  // write events in batches
+  //
+  c3_d written_d = 0;
+  while ( written_d < num_d ) {
+    c3_d remaining = num_d - written_d;
+    c3_d batch_size = (remaining < bat_d) ? remaining : bat_d;
+
+    // update event data patterns
+    //
+    for ( c3_d i = 0; i < batch_size; i++ ) {
+      c3_w mug_w = (c3_w)((written_d + i + 1) * 0x12345678);
+      memcpy(evt_y[i], &mug_w, hed_z);
+    }
+
+    c3_o sav_o = u3_lmdb_save(env_u, written_d + 1, batch_size, byt_p, siz_i);
+    if ( c3n == sav_o ) {
+      fprintf(stderr, " write_speed_batched: save failed at event %" PRIu64 "\r\n",
+                      written_d + 1);
+      ret_i = 0;
+      goto cleanup_buffers;
+    }
+
+    written_d += batch_size;
+  }
+
+  // end timing
+  //
+  c3_d end_d = _bench_get_time_ns();
+  c3_d elapsed_ns = end_d - start_d;
+
+  // calculate metrics
+  //
+  double elapsed_sec = (double)elapsed_ns / 1e9;
+  double events_per_sec = (double)num_d / elapsed_sec;
+  double total_bytes = (double)num_d * (double)siz_z;
+  double mb_per_sec = (total_bytes / (1024.0 * 1024.0)) / elapsed_sec;
+  double us_per_event = ((double)elapsed_ns / 1000.0) / (double)num_d;
+
+  // report results
+  //
+  fprintf(stderr, "\r\n");
+  fprintf(stderr, " write_speed benchmark (batched writes, batch=%" PRIu64 "):\r\n", bat_d);
+  fprintf(stderr, " events written: %" PRIu64 "\r\n", num_d);
+  fprintf(stderr, " event size: %" PRIu64 " bytes\r\n", (c3_d)siz_z);
+  fprintf(stderr, " total data: %.2f MB\r\n", total_bytes / (1024.0 * 1024.0));
+  fprintf(stderr, " total time: %.3f seconds\r\n", elapsed_sec);
+  fprintf(stderr, " write speed: %.0f events/sec\r\n", events_per_sec);
+  fprintf(stderr, " throughput: %.2f MB/sec\r\n", mb_per_sec);
+  fprintf(stderr, " latency: %.1f us/event\r\n", us_per_event);
+  fprintf(stderr, "\r\n");
+
+cleanup_buffers:
+  for ( c3_d i = 0; i < mad_d; i++ ) {
+    _free(evt_y[i]);
+  }
+  _free(evt_y);
+  _free(byt_p);
+  _free(siz_i);
+
+  u3_lmdb_exit(env_u);
+
+cleanup:
+  _test_rm_rf(dir_c);
+  _free(dir_c);
+
+  fprintf(stderr, " write_speed_batched_benchmark: %s\r\n", ret_i ? "ok" : "FAILED");
+  return ret_i;
+}
+
+/* _bench_write_speed_mixed(): benchmark mixed batch-size write performance.
+**
+**   writes roughly [num_d] events of [siz_z] bytes using a realistic
+**   distribution of batch sizes (1-9), interleaved via deterministic PRNG.
+**   the per-size call counts are scaled with integer division and every
+**   batch size is used at least once, so the number of events written
+**   only approximates [num_d]; the exact count is reported.
+**   reports total time, events/sec, MB/s, per-event latency, and save calls.
+**
+**   returns: 1 on success, 0 on failure
+*/
+static c3_i
+_bench_write_speed_mixed(c3_d num_d, c3_z siz_z)
+{
+  // batch size distribution from production telemetry
+  //
+  static const c3_d bat_d[9] = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+  static const c3_d cnt_d[9] = {
+    2128433, 407761, 234541, 89359, 41390, 21376, 10945, 5399, 5466
+  };
+
+  // compute original total events for scaling
+  //
+  c3_d ori_d = 0;
+  for ( c3_d i = 0; i < 9; i++ ) {
+    ori_d += bat_d[i] * cnt_d[i];
+  }
+
+  // scale counts proportionally to num_d; [rem_d] holds the number of
+  // save calls still to be issued per batch size, [tot_d] their sum
+  //
+  c3_d rem_d[9];
+  c3_d tot_d = 0;
+  for ( c3_d i = 0; i < 9; i++ ) {
+    rem_d[i] = (cnt_d[i] * num_d) / ori_d;
+    if ( (0 == rem_d[i]) && (cnt_d[i] > 0) ) {
+      rem_d[i] = 1;
+    }
+    tot_d += rem_d[i];
+  }
+
+  c3_c* dir_c = _test_make_tmpdir();
+  if ( !dir_c ) return 0;
+
+  c3_i ret_i = 1;
+  MDB_env* env_u = u3_lmdb_init(dir_c, LMDB_MAP_SIZE);
+
+  if ( !env_u ) {
+    fprintf(stderr, " write_speed_mixed: init failed\r\n");
+    ret_i = 0;
+    goto cleanup;
+  }
+
+  // pre-allocate event buffers for max batch size (9); the pointers
+  // start out null so cleanup can free a partially built set
+  //
+  c3_y*  evt_y[9] = { 0 };
+  void*  byt_p[9];
+  size_t siz_i[9];
+
+  for ( c3_d i = 0; i < 9; i++ ) {
+    evt_y[i] = _bench_make_event(siz_z, i + 1);
+
+    if ( !evt_y[i] ) {
+      fprintf(stderr, " write_speed_mixed: event alloc failed\r\n");
+      ret_i = 0;
+      goto cleanup_buffers;
+    }
+
+    byt_p[i] = evt_y[i];
+    siz_i[i] = siz_z;
+  }
+
+  // the per-event mug is 4 bytes; never stamp past a shorter buffer
+  //
+  c3_z hed_z = ( siz_z < 4 ) ? siz_z : 4;
+
+  // deterministic xorshift32 PRNG
+  //
+  c3_w rng_w = 12345;
+
+  c3_d wit_d = 0;  // events written
+  c3_d cal_d = 0;  // save calls made
+
+  // start timing
+  //
+  c3_d beg_d = _bench_get_time_ns();
+
+  while ( tot_d > 0 ) {
+    // xorshift32 step
+    //
+    rng_w ^= rng_w << 13;
+    rng_w ^= rng_w >> 17;
+    rng_w ^= rng_w << 5;
+
+    // weighted selection from remaining counts: [pick] is below the
+    // sum of [rem_d], so the scan always stops at a valid index
+    //
+    c3_d pick = (c3_d)rng_w % tot_d;
+    c3_d acc = 0;
+    c3_d idx = 0;
+
+    for ( idx = 0; idx < 9; idx++ ) {
+      acc += rem_d[idx];
+      if ( pick < acc ) break;
+    }
+
+    c3_d bsz = bat_d[idx];
+    rem_d[idx]--;
+    tot_d--;
+
+    // update mug patterns in event buffers
+    //
+    for ( c3_d j = 0; j < bsz; j++ ) {
+      c3_w mug_w = (c3_w)((wit_d + j + 1) * 0x12345678);
+      memcpy(evt_y[j], &mug_w, hed_z);
+    }
+
+    c3_o sav_o = u3_lmdb_save(env_u, wit_d + 1, bsz, byt_p, siz_i);
+    if ( c3n == sav_o ) {
+      fprintf(stderr, " write_speed_mixed: save failed at event %" PRIu64 "\r\n",
+                      wit_d + 1);
+      ret_i = 0;
+      goto cleanup_buffers;
+    }
+
+    wit_d += bsz;
+    cal_d++;
+  }
+
+  // end timing
+  //
+  c3_d end_d = _bench_get_time_ns();
+  c3_d lap_d = end_d - beg_d;
+
+  // calculate metrics
+  //
+  double elapsed_sec = (double)lap_d / 1e9;
+  double events_per_sec = (double)wit_d / elapsed_sec;
+  double total_bytes = (double)wit_d * (double)siz_z;
+  double mb_per_sec = (total_bytes / (1024.0 * 1024.0)) / elapsed_sec;
+  double us_per_event = ((double)lap_d / 1000.0) / (double)wit_d;
+
+  // report results
+  //
+  fprintf(stderr, "\r\n");
+  fprintf(stderr, " write_speed benchmark (mixed batch sizes 1-9):\r\n");
+  fprintf(stderr, " events written: %" PRIu64 "\r\n", wit_d);
+  fprintf(stderr, " save calls: %" PRIu64 "\r\n", cal_d);
+  fprintf(stderr, " event size: %" PRIu64 " bytes\r\n", (c3_d)siz_z);
+  fprintf(stderr, " total data: %.2f MB\r\n", total_bytes / (1024.0 * 1024.0));
+  fprintf(stderr, " total time: %.3f seconds\r\n", elapsed_sec);
+  fprintf(stderr, " write speed: %.0f events/sec\r\n", events_per_sec);
+  fprintf(stderr, " throughput: %.2f MB/sec\r\n", mb_per_sec);
+  fprintf(stderr, " latency: %.1f us/event\r\n", us_per_event);
+  fprintf(stderr, "\r\n");
+
+cleanup_buffers:
+  for ( c3_d i = 0; i < 9; i++ ) {
+    _free(evt_y[i]);
+  }
+
+  u3_lmdb_exit(env_u);
+
+cleanup:
+  _test_rm_rf(dir_c);
+  _free(dir_c);
+
+  fprintf(stderr, " write_speed_mixed_benchmark: %s\r\n", ret_i ? "ok" : "FAILED");
+  return ret_i;
+}
+
+//==============================================================================
+// Main
+//==============================================================================
+
+/* main(): run the enabled lmdb benchmarks.
+**
+**   only the mixed-batch benchmark is currently enabled; the other two
+**   are left commented out. exits 0 if every enabled benchmark returned
+**   nonzero, 1 otherwise.
+*/
+int
+main(int argc, char* argv[])
+{
+  c3_i ret_i = 1;
+
+  // benchmarks
+  // ret_i &= _bench_write_speed(1000, 128);
+  // ret_i &= _bench_write_speed_batched(100000, 1280, 1000);
+  ret_i &= _bench_write_speed_mixed(10000, 128);
+
+  fprintf(stderr, "\r\n");
+  if ( ret_i ) {
+    fprintf(stderr, "lmdb_tests: ok\n");
+    return 0;
+  }
+  else {
+    fprintf(stderr, "lmdb_tests: failed\n");
+    return 1;
+  }
+}
diff --git a/pkg/vere/main.c b/pkg/vere/main.c index 879e173e03..e9b71c403f 100644 --- a/pkg/vere/main.c +++ b/pkg/vere/main.c @@ -1519,7 +1519,7 @@ _cw_info(c3_i argc, c3_c* argv[]) fprintf(stderr, "\r\n"); } - u3_lmdb_stat(log_u->mdb_u, stdout); + u3_book_stat(log_u->txt_u->pax_c); u3_disk_exit(log_u); u3m_stop(); diff --git a/pkg/vere/mars.c b/pkg/vere/mars.c index 817ac849fe..1507c40e12 100644 --- a/pkg/vere/mars.c +++ b/pkg/vere/mars.c @@ -1949,7 +1949,10 @@ u3_mars_boot(u3_mars* mar_u, c3_d len_d, c3_y* hun_y) exit(1); // XX cleanup } - if ( c3n == u3_disk_save_meta(log_u->mdb_u, &met_u) ) { + if ( c3n == u3_book_save_meta(log_u->txt_u, "version", sizeof(c3_w), (c3_y*)&met_u.ver_w) + || c3n == u3_book_save_meta(log_u->txt_u, "who", sizeof(met_u.who_d), (c3_y*)met_u.who_d) + || c3n == u3_book_save_meta(log_u->txt_u, "fake", sizeof(c3_o), (c3_y*)&met_u.fak_o) + || c3n == u3_book_save_meta(log_u->txt_u, "life", sizeof(c3_w), (c3_y*)&met_u.lif_w) ) { exit(1); // XX cleanup } diff --git a/pkg/vere/vere.h b/pkg/vere/vere.h index 4c001bbebb..fc17bfe7ac 100644 --- a/pkg/vere/vere.h +++ b/pkg/vere/vere.h @@ -10,6 +10,7 @@ #include "c3/c3.h" #include "db/lmdb.h" +#include "db/book.h" #include "noun.h" #include "uv.h" #include @@ -533,7 +534,8 @@ c3_i lok_i; // lockfile c3_o liv_o; // live c3_w ver_w; // version (see version.h) - void* mdb_u; // lmdb env of current 
epoch + MDB_env* mdb_u; // lmdb env for top-level metadata + u3_book* txt_u; // book env of current epoch c3_d sen_d; // commit requested c3_d dun_d; // committed c3_d epo_d; // current epoch number @@ -875,12 +877,12 @@ void u3_disk_exit(u3_disk* log_u); - /* u3_disk_read_meta(): read metadata. + /* u3_disk_read_meta(): read metadata from lmdb. */ c3_o u3_disk_read_meta(MDB_env* mdb_u, u3_meta* met_u); - /* u3_disk_save_meta(): save metadata. + /* u3_disk_save_meta(): save metadata to lmdb. */ c3_o u3_disk_save_meta(MDB_env* mdb_u, const u3_meta* met_u);