From 991bf4d618e82fb39da61115a772a62ab2bc5823 Mon Sep 17 00:00:00 2001 From: nabbilkhan Date: Tue, 3 Mar 2026 19:36:51 +0000 Subject: [PATCH 1/2] Harden token dataset validation across all training pipelines --- training/Makefile | 43 +++++++----- training/README.md | 6 +- training/data_validation.h | 65 +++++++++++++++++ training/test_data_validation.c | 112 ++++++++++++++++++++++++++++++ training/train_large.m | 38 +++++++--- training/train_large_ane.m | 18 ++++- training/training_dynamic/train.m | 42 +++++++---- 7 files changed, 280 insertions(+), 44 deletions(-) create mode 100644 training/data_validation.h create mode 100644 training/test_data_validation.c diff --git a/training/Makefile b/training/Makefile index 7f16c1a..22bff18 100644 --- a/training/Makefile +++ b/training/Makefile @@ -1,9 +1,11 @@ -CC = xcrun clang -CFLAGS = -O2 -Wall -Wno-deprecated-declarations -fobjc-arc -FRAMEWORKS = -framework Foundation -framework CoreML -framework IOSurface -LDFLAGS = $(FRAMEWORKS) -ldl - -HEADERS_LARGE = stories_config.h stories_io.h stories_mil.h stories_cpu_ops.h +CC = xcrun clang +CFLAGS = -O2 -Wall -Wno-deprecated-declarations -fobjc-arc +CC_C = xcrun clang +CFLAGS_C = -O2 -Wall -Wextra -Werror -std=c11 +FRAMEWORKS = -framework Foundation -framework CoreML -framework IOSurface +LDFLAGS = $(FRAMEWORKS) -ldl + +HEADERS_LARGE = stories_config.h stories_io.h stories_mil.h stories_cpu_ops.h data_validation.h HEADERS_ANE = $(HEADERS_LARGE) ane_rmsnorm_bwd.h ane_classifier.h @@ -33,16 +35,21 @@ test_perf_stats: test_perf_stats.m test_qos_sweep: test_qos_sweep.m $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) -test_ane_advanced: test_ane_advanced.m - $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) - -probes: $(PROBES) - -tokenize: - python3 tokenize.py - -clean: - rm -f train train_large train_large_ane $(PROBES) test_rmsnorm_bwd test_classifier - -.PHONY: clean tokenize probes +test_ane_advanced: test_ane_advanced.m + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +test_data_validation: 
test_data_validation.c data_validation.h + $(CC_C) $(CFLAGS_C) -o $@ $< + +probes: $(PROBES) + +security-tests: test_data_validation + +tokenize: + python3 tokenize.py + +clean: + rm -f train train_large train_large_ane $(PROBES) test_rmsnorm_bwd test_classifier test_data_validation + +.PHONY: clean tokenize probes security-tests diff --git a/training/README.md b/training/README.md index 8ccde88..6b4d1bc 100644 --- a/training/README.md +++ b/training/README.md @@ -78,7 +78,11 @@ Weights passed via IOSurface spatial dimension — compile 9 kernels once at sta bash download_data.sh ``` -Downloads pretokenized TinyStories (Llama 2 BPE, 32K vocab) from HuggingFace. Produces `tinystories_data00.bin` (~41 MB, ~20M tokens). +Downloads pretokenized TinyStories (Llama 2 BPE, 32K vocab) from HuggingFace. Produces `tinystories_data00.bin` (~41 MB, ~20M tokens). + +All training pipelines perform token-data validation at startup: +- token file must contain at least `SEQ+1` tokens +- every token id must be within `[0, vocab_size)` ### 2. 
Build & Train diff --git a/training/data_validation.h b/training/data_validation.h new file mode 100644 index 0000000..8958ecc --- /dev/null +++ b/training/data_validation.h @@ -0,0 +1,65 @@ +// data_validation.h — Shared token-data validation helpers +#pragma once +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +typedef enum { + TOKEN_DATA_VALID = 0, + TOKEN_DATA_ERR_TOO_SHORT = 1, + TOKEN_DATA_ERR_OOB_TOKEN = 2 +} TokenDataValidationCode; + +typedef struct { + size_t required_tokens; + size_t bad_index; + uint16_t bad_token; +} TokenDataValidationError; + +static inline bool token_data_has_min_tokens(size_t n_tokens, int seq, size_t *required_tokens) { + if (seq < 0) return false; + size_t needed = (size_t)seq + 1; + if (required_tokens) *required_tokens = needed; + return n_tokens >= needed; +} + +static inline bool token_data_find_oob_token(const uint16_t *token_data, size_t n_tokens, int vocab, + size_t *bad_index, uint16_t *bad_token) { + if (!token_data || n_tokens == 0 || vocab <= 0) return false; + for (size_t i = 0; i < n_tokens; i++) { + if ((int)token_data[i] >= vocab) { + if (bad_index) *bad_index = i; + if (bad_token) *bad_token = token_data[i]; + return true; + } + } + return false; +} + +static inline TokenDataValidationCode token_data_validate(const uint16_t *token_data, size_t n_tokens, + int seq, int vocab, + TokenDataValidationError *err) { + if (err) { + err->required_tokens = 0; + err->bad_index = 0; + err->bad_token = 0; + } + + size_t required = 0; + if (!token_data_has_min_tokens(n_tokens, seq, &required)) { + if (err) err->required_tokens = required; + return TOKEN_DATA_ERR_TOO_SHORT; + } + + size_t bad_index = 0; + uint16_t bad_token = 0; + if (token_data_find_oob_token(token_data, n_tokens, vocab, &bad_index, &bad_token)) { + if (err) { + err->bad_index = bad_index; + err->bad_token = bad_token; + } + return TOKEN_DATA_ERR_OOB_TOKEN; + } + + return TOKEN_DATA_VALID; +} diff --git a/training/test_data_validation.c b/training/test_data_validation.c new
file mode 100644 index 0000000..028d561 --- /dev/null +++ b/training/test_data_validation.c @@ -0,0 +1,112 @@ +// test_data_validation.c — Unit tests for token-data hardening helpers +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +#include "data_validation.h" + +typedef struct { + int passed; + int failed; +} TestStats; + +#define CHECK_TRUE(stats, cond, msg) \ + do { \ + if (!(cond)) { \ + fprintf(stderr, "FAIL: %s (%s:%d)\n", msg, __FILE__, __LINE__); \ + (stats)->failed++; \ + return; \ + } \ + } while (0) + +#define CHECK_EQ_INT(stats, got, want, msg) CHECK_TRUE((stats), (got) == (want), msg) +#define CHECK_EQ_SIZE(stats, got, want, msg) CHECK_TRUE((stats), (got) == (want), msg) + +static void test_min_tokens_boundary(TestStats *stats) { + size_t required = 0; + CHECK_TRUE(stats, token_data_has_min_tokens(257, 256, &required), "257 tokens should satisfy seq=256"); + CHECK_EQ_SIZE(stats, required, 257, "required tokens should be seq+1"); + stats->passed++; +} + +static void test_min_tokens_short(TestStats *stats) { + size_t required = 0; + CHECK_TRUE(stats, !token_data_has_min_tokens(256, 256, &required), "256 tokens should fail seq=256"); + CHECK_EQ_SIZE(stats, required, 257, "required tokens should still be seq+1"); + stats->passed++; +} + +static void test_validate_too_short(TestStats *stats) { + uint16_t tokens[2] = {1, 2}; + TokenDataValidationError err = {0}; + TokenDataValidationCode code = token_data_validate(tokens, 2, 4, 32000, &err); + CHECK_EQ_INT(stats, code, TOKEN_DATA_ERR_TOO_SHORT, "too-short dataset should fail"); + CHECK_EQ_SIZE(stats, err.required_tokens, 5, "required token count should be reported"); + stats->passed++; +} + +static void test_validate_oob_first(TestStats *stats) { + uint16_t tokens[6] = {32000, 1, 2, 3, 4, 5}; + TokenDataValidationError err = {0}; + TokenDataValidationCode code = token_data_validate(tokens, 6, 4, 32000, &err); + CHECK_EQ_INT(stats, code, TOKEN_DATA_ERR_OOB_TOKEN, "first token OOB should fail"); +
CHECK_EQ_SIZE(stats, err.bad_index, 0, "bad index should point to first token"); + CHECK_EQ_INT(stats, err.bad_token, 32000, "bad token value should be reported"); + stats->passed++; +} + +static void test_validate_oob_middle(TestStats *stats) { + uint16_t tokens[7] = {1, 2, 3, 65535, 4, 5, 6}; + TokenDataValidationError err = {0}; + TokenDataValidationCode code = token_data_validate(tokens, 7, 4, 32000, &err); + CHECK_EQ_INT(stats, code, TOKEN_DATA_ERR_OOB_TOKEN, "middle token OOB should fail"); + CHECK_EQ_SIZE(stats, err.bad_index, 3, "bad index should point to middle token"); + CHECK_EQ_INT(stats, err.bad_token, 65535, "bad token value should be reported"); + stats->passed++; +} + +static void test_validate_oob_last(TestStats *stats) { + uint16_t tokens[6] = {1, 2, 3, 4, 5, 40000}; + TokenDataValidationError err = {0}; + TokenDataValidationCode code = token_data_validate(tokens, 6, 4, 32000, &err); + CHECK_EQ_INT(stats, code, TOKEN_DATA_ERR_OOB_TOKEN, "last token OOB should fail"); + CHECK_EQ_SIZE(stats, err.bad_index, 5, "bad index should point to last token"); + CHECK_EQ_INT(stats, err.bad_token, 40000, "bad token value should be reported"); + stats->passed++; +} + +static void test_validate_ok(TestStats *stats) { + uint16_t tokens[8] = {0, 1, 2, 3, 4, 5, 31998, 31999}; + TokenDataValidationError err; + memset(&err, 0xA5, sizeof(err)); + TokenDataValidationCode code = token_data_validate(tokens, 8, 4, 32000, &err); + CHECK_EQ_INT(stats, code, TOKEN_DATA_VALID, "valid dataset should pass"); + stats->passed++; +} + +static void test_find_oob_empty(TestStats *stats) { + size_t bad_index = 123; + uint16_t bad_token = 456; + CHECK_TRUE(stats, !token_data_find_oob_token(NULL, 0, 32000, &bad_index, &bad_token), + "empty dataset should not report OOB token"); + CHECK_EQ_SIZE(stats, bad_index, 123, "bad index should remain unchanged for empty input"); + CHECK_EQ_INT(stats, bad_token, 456, "bad token should remain unchanged for empty input"); + stats->passed++; +} + 
+int main(void) { + TestStats stats = {0, 0}; + + test_min_tokens_boundary(&stats); + test_min_tokens_short(&stats); + test_validate_too_short(&stats); + test_validate_oob_first(&stats); + test_validate_oob_middle(&stats); + test_validate_oob_last(&stats); + test_validate_ok(&stats); + test_find_oob_empty(&stats); + + printf("test_data_validation: %d passed, %d failed\n", stats.passed, stats.failed); + return stats.failed == 0 ? 0 : 1; +} diff --git a/training/train_large.m b/training/train_large.m index 17fb1c5..6894136 100644 --- a/training/train_large.m +++ b/training/train_large.m @@ -1,9 +1,10 @@ // train_large.m — Train stories110M (12 layers, 768dim, 3072hidden) on ANE // Uses pretokenized TinyStories data with cross-entropy loss // 5 weight-bearing ANE kernels per layer × 12 layers = 60 per compile batch -#include "stories_io.h" -#include "stories_mil.h" -#include "stories_cpu_ops.h" +#include "stories_io.h" +#include "stories_mil.h" +#include "stories_cpu_ops.h" +#include "data_validation.h" #define CKPT_PATH_DEFAULT "ane_stories110M_ckpt.bin" #define MODEL_PATH_DEFAULT "stories110M.bin" @@ -283,14 +284,29 @@ int main(int argc, char *argv[]) { } // mmap token data - int data_fd = open(DATA_PATH, O_RDONLY); - if (data_fd < 0) { printf("Cannot open %s\n", DATA_PATH); return 1; } - struct stat st; fstat(data_fd, &st); - size_t data_len = st.st_size; - uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0); - if (token_data == MAP_FAILED) { printf("mmap failed\n"); return 1; } - size_t n_tokens = data_len / 2; - printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6); + int data_fd = open(DATA_PATH, O_RDONLY); + if (data_fd < 0) { printf("Cannot open %s\n", DATA_PATH); return 1; } + struct stat st; fstat(data_fd, &st); + size_t data_len = st.st_size; + uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0); + if (token_data == MAP_FAILED) { printf("mmap failed\n"); 
close(data_fd); return 1; } + size_t n_tokens = data_len / 2; + printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6); + + TokenDataValidationError data_err = {0}; + TokenDataValidationCode data_code = token_data_validate(token_data, n_tokens, SEQ, VOCAB, &data_err); + if (data_code == TOKEN_DATA_ERR_TOO_SHORT) { + fprintf(stderr, "Token data validation failed: need at least %zu tokens (SEQ+1), got %zu\n", + data_err.required_tokens, n_tokens); + munmap(token_data, data_len); close(data_fd); + return 1; + } + if (data_code == TOKEN_DATA_ERR_OOB_TOKEN) { + fprintf(stderr, "Token data validation failed: token %u at index %zu is outside vocab [0, %d)\n", + data_err.bad_token, data_err.bad_index, VOCAB); + munmap(token_data, data_len); close(data_fd); + return 1; + } // Gradient buffers shared across layers (reused each step) float *dy = (float*)malloc(SEQ*DIM*4); // gradient flowing backward diff --git a/training/train_large_ane.m b/training/train_large_ane.m index ba9dfe7..ab2b2c9 100644 --- a/training/train_large_ane.m +++ b/training/train_large_ane.m @@ -13,6 +13,7 @@ #include "stories_io.h" #include "stories_mil.h" #include "stories_cpu_ops.h" +#include "data_validation.h" #include "ane_rmsnorm_bwd.h" #include "ane_classifier.h" @@ -276,10 +277,25 @@ int main(int argc, char *argv[]) { struct stat st; fstat(data_fd, &st); size_t data_len = st.st_size; uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0); - if (token_data == MAP_FAILED) { printf("mmap failed\n"); return 1; } + if (token_data == MAP_FAILED) { printf("mmap failed\n"); close(data_fd); return 1; } size_t n_tokens = data_len / 2; printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6); + TokenDataValidationError data_err = {0}; + TokenDataValidationCode data_code = token_data_validate(token_data, n_tokens, SEQ, VOCAB, &data_err); + if (data_code == TOKEN_DATA_ERR_TOO_SHORT) { + fprintf(stderr, "Token data validation failed: need at least 
%zu tokens (SEQ+1), got %zu\n", + data_err.required_tokens, n_tokens); + munmap(token_data, data_len); close(data_fd); + return 1; + } + if (data_code == TOKEN_DATA_ERR_OOB_TOKEN) { + fprintf(stderr, "Token data validation failed: token %u at index %zu is outside vocab [0, %d)\n", + data_err.bad_token, data_err.bad_index, VOCAB); + munmap(token_data, data_len); close(data_fd); + return 1; + } + // Gradient buffers float *dy = (float*)malloc(SEQ*DIM*4); float *dffn = (float*)malloc(SEQ*DIM*4); diff --git a/training/training_dynamic/train.m b/training/training_dynamic/train.m index 412c4d8..8b8b289 100644 --- a/training/training_dynamic/train.m +++ b/training/training_dynamic/train.m @@ -1,8 +1,9 @@ // train.m — Dynamic weight ANE training for Stories110M // Compile kernels ONCE at startup, update weights via IOSurface every step. // No exec() restart needed — eliminates 76% compile overhead. -#include "mil_dynamic.h" -#include "cpu_ops.h" +#include "mil_dynamic.h" +#include "cpu_ops.h" +#include "../data_validation.h" #define CKPT_PATH "ane_stories110M_dyn_ckpt.bin" #define MODEL_PATH "../../../assets/models/stories110M.bin" @@ -333,17 +334,32 @@ int main(int argc, char *argv[]) { } // mmap token data - int data_fd = open(DATA_PATH, O_RDONLY); - if (data_fd < 0) { printf("Cannot open %s\n", DATA_PATH); return 1; } - struct stat st; fstat(data_fd, &st); - size_t data_len = st.st_size; - uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0); - if (token_data == MAP_FAILED) { printf("mmap failed\n"); return 1; } - size_t n_tokens = data_len / 2; - printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6); - - // Vocab compaction: map 32K sparse vocab → ~9K compact - VocabMap vm = vocab_map_build(token_data, n_tokens, VOCAB); + int data_fd = open(DATA_PATH, O_RDONLY); + if (data_fd < 0) { printf("Cannot open %s\n", DATA_PATH); return 1; } + struct stat st; fstat(data_fd, &st); + size_t data_len = st.st_size; + uint16_t 
*token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0); + if (token_data == MAP_FAILED) { printf("mmap failed\n"); close(data_fd); return 1; } + size_t n_tokens = data_len / 2; + printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6); + + TokenDataValidationError data_err = {0}; + TokenDataValidationCode data_code = token_data_validate(token_data, n_tokens, SEQ, VOCAB, &data_err); + if (data_code == TOKEN_DATA_ERR_TOO_SHORT) { + fprintf(stderr, "Token data validation failed: need at least %zu tokens (SEQ+1), got %zu\n", + data_err.required_tokens, n_tokens); + munmap(token_data, data_len); close(data_fd); + return 1; + } + if (data_code == TOKEN_DATA_ERR_OOB_TOKEN) { + fprintf(stderr, "Token data validation failed: token %u at index %zu is outside vocab [0, %d)\n", + data_err.bad_token, data_err.bad_index, VOCAB); + munmap(token_data, data_len); close(data_fd); + return 1; + } + + // Vocab compaction: map 32K sparse vocab → ~9K compact + VocabMap vm = vocab_map_build(token_data, n_tokens, VOCAB); int CV = vm.compact_vocab; printf("Vocab compaction: %d → %d active tokens (%.1fx reduction)\n", VOCAB, CV, (float)VOCAB/CV); From 60b0512be32bf88f096481ac105f5558c0afa631 Mon Sep 17 00:00:00 2001 From: nabbilkhan Date: Tue, 3 Mar 2026 19:42:33 +0000 Subject: [PATCH 2/2] Harden token file layout checks and prevent exec-time fd leaks --- training/README.md | 1 + training/data_validation.h | 8 ++ training/test_data_validation.c | 127 ++++++++++++++++++++++++++++++ training/train_large.m | 53 ++++++++----- training/train_large_ane.m | 28 +++++-- training/training_dynamic/train.m | 42 ++++++---- 6 files changed, 218 insertions(+), 41 deletions(-) diff --git a/training/README.md b/training/README.md index 6b4d1bc..d9de619 100644 --- a/training/README.md +++ b/training/README.md @@ -81,6 +81,7 @@ bash download_data.sh Downloads pretokenized TinyStories (Llama 2 BPE, 32K vocab) from HuggingFace. 
Produces `tinystories_data00.bin` (~41 MB, ~20M tokens). All training pipelines perform token-data validation at startup: +- token file byte length must align to 16-bit token boundaries - token file must contain at least `SEQ+1` tokens - every token id must be within `[0, vocab_size)` diff --git a/training/data_validation.h b/training/data_validation.h index 8958ecc..63e016a 100644 --- a/training/data_validation.h +++ b/training/data_validation.h @@ -16,6 +16,14 @@ typedef struct { uint16_t bad_token; } TokenDataValidationError; +// Token files are 16-bit ids. Return false when byte length is misaligned. +static inline bool token_data_bytes_to_token_count(size_t n_bytes, size_t *n_tokens, size_t *extra_bytes) { + size_t rem = n_bytes % sizeof(uint16_t); + if (n_tokens) *n_tokens = n_bytes / sizeof(uint16_t); + if (extra_bytes) *extra_bytes = rem; + return rem == 0; +} + static inline bool token_data_has_min_tokens(size_t n_tokens, int seq, size_t *required_tokens) { if (seq < 0) return false; size_t needed = (size_t)seq + 1; diff --git a/training/test_data_validation.c b/training/test_data_validation.c index 028d561..8d57068 100644 --- a/training/test_data_validation.c +++ b/training/test_data_validation.c @@ -23,6 +23,39 @@ typedef struct { #define CHECK_EQ_INT(stats, got, want, msg) CHECK_TRUE((stats), (got) == (want), msg) #define CHECK_EQ_SIZE(stats, got, want, msg) CHECK_TRUE((stats), (got) == (want), msg) +static uint32_t lcg_next(uint32_t *state) { + *state = (*state * 1664525u) + 1013904223u; + return *state; +} + +static void test_bytes_to_token_count_even(TestStats *stats) { + size_t n_tokens = 0; + size_t extra = 99; + CHECK_TRUE(stats, token_data_bytes_to_token_count(1024, &n_tokens, &extra), + "even byte length should map to token count"); + CHECK_EQ_SIZE(stats, n_tokens, 512, "1024 bytes should map to 512 tokens"); + CHECK_EQ_SIZE(stats, extra, 0, "even byte length should have zero remainder"); + stats->passed++; +} + +static void 
test_bytes_to_token_count_odd(TestStats *stats) { + size_t n_tokens = 0; + size_t extra = 0; + CHECK_TRUE(stats, !token_data_bytes_to_token_count(1025, &n_tokens, &extra), + "odd byte length should fail alignment check"); + CHECK_EQ_SIZE(stats, n_tokens, 512, "odd byte length should still report floor token count"); + CHECK_EQ_SIZE(stats, extra, 1, "1025 bytes should report one extra byte"); + stats->passed++; +} + +static void test_bytes_to_token_count_null_outputs(TestStats *stats) { + CHECK_TRUE(stats, token_data_bytes_to_token_count(8, NULL, NULL), + "alignment helper should work with null output pointers"); + CHECK_TRUE(stats, !token_data_bytes_to_token_count(9, NULL, NULL), + "alignment helper should fail odd byte length with null outputs"); + stats->passed++; +} + static void test_min_tokens_boundary(TestStats *stats) { size_t required = 0; CHECK_TRUE(stats, token_data_has_min_tokens(257, 256, &required), "257 tokens should satisfy seq=256"); @@ -37,6 +70,13 @@ static void test_min_tokens_short(TestStats *stats) { stats->passed++; } +static void test_min_tokens_negative_seq(TestStats *stats) { + size_t required = 777; + CHECK_TRUE(stats, !token_data_has_min_tokens(10, -1, &required), "negative seq should fail min-token check"); + CHECK_EQ_SIZE(stats, required, 777, "required token out param should remain unchanged for invalid seq"); + stats->passed++; +} + static void test_validate_too_short(TestStats *stats) { uint16_t tokens[2] = {1, 2}; TokenDataValidationError err = {0}; @@ -46,6 +86,22 @@ static void test_validate_too_short(TestStats *stats) { stats->passed++; } +static void test_validate_too_short_precedes_oob(TestStats *stats) { + uint16_t tokens[2] = {65000, 1}; + TokenDataValidationError err = {0}; + TokenDataValidationCode code = token_data_validate(tokens, 2, 4, 32000, &err); + CHECK_EQ_INT(stats, code, TOKEN_DATA_ERR_TOO_SHORT, "too-short check should happen before OOB check"); + CHECK_EQ_SIZE(stats, err.required_tokens, 5, "required token count 
should still be reported"); + stats->passed++; +} + +static void test_validate_too_short_with_null_err(TestStats *stats) { + uint16_t tokens[2] = {1, 2}; + TokenDataValidationCode code = token_data_validate(tokens, 2, 4, 32000, NULL); + CHECK_EQ_INT(stats, code, TOKEN_DATA_ERR_TOO_SHORT, "validation should work when err output is null"); + stats->passed++; +} + static void test_validate_oob_first(TestStats *stats) { uint16_t tokens[6] = {32000, 1, 2, 3, 4, 5}; TokenDataValidationError err = {0}; @@ -85,6 +141,20 @@ static void test_validate_ok(TestStats *stats) { stats->passed++; } +static void test_validate_vocab_boundary(TestStats *stats) { + uint16_t valid_tokens[3] = {0, 0, 0}; + TokenDataValidationError err = {0}; + TokenDataValidationCode valid_code = token_data_validate(valid_tokens, 3, 2, 1, &err); + CHECK_EQ_INT(stats, valid_code, TOKEN_DATA_VALID, "token 0 should be valid when vocab=1"); + + uint16_t invalid_tokens[3] = {0, 1, 0}; + TokenDataValidationCode invalid_code = token_data_validate(invalid_tokens, 3, 2, 1, &err); + CHECK_EQ_INT(stats, invalid_code, TOKEN_DATA_ERR_OOB_TOKEN, "token >= vocab should fail at vocab boundary"); + CHECK_EQ_SIZE(stats, err.bad_index, 1, "boundary OOB should report exact index"); + CHECK_EQ_INT(stats, err.bad_token, 1, "boundary OOB should report offending token"); + stats->passed++; +} + static void test_find_oob_empty(TestStats *stats) { size_t bad_index = 123; uint16_t bad_token = 456; @@ -95,17 +165,74 @@ static void test_find_oob_empty(TestStats *stats) { stats->passed++; } +static void test_find_oob_null_outputs(TestStats *stats) { + uint16_t tokens[4] = {0, 1, 32000, 2}; + CHECK_TRUE(stats, token_data_find_oob_token(tokens, 4, 32000, NULL, NULL), + "OOB scan should work with null output pointers"); + stats->passed++; +} + +static void test_find_oob_invalid_vocab(TestStats *stats) { + uint16_t tokens[3] = {0, 1, 2}; + CHECK_TRUE(stats, !token_data_find_oob_token(tokens, 3, 0, NULL, NULL), + "OOB scan should reject 
non-positive vocab"); + CHECK_TRUE(stats, !token_data_find_oob_token(tokens, 3, -1, NULL, NULL), + "OOB scan should reject negative vocab"); + stats->passed++; +} + +static void test_find_oob_randomized_consistency(TestStats *stats) { + uint32_t seed = 1; + for (int iter = 0; iter < 512; iter++) { + int vocab = (int)(lcg_next(&seed) % 128u) + 1; + size_t n_tokens = (size_t)(lcg_next(&seed) % 64u); + uint16_t tokens[64] = {0}; + + bool expected_found = false; + size_t expected_index = 0; + uint16_t expected_token = 0; + for (size_t i = 0; i < n_tokens; i++) { + tokens[i] = (uint16_t)(lcg_next(&seed) % 256u); + if (!expected_found && (int)tokens[i] >= vocab) { + expected_found = true; + expected_index = i; + expected_token = tokens[i]; + } + } + + size_t got_index = 0; + uint16_t got_token = 0; + bool got_found = token_data_find_oob_token(tokens, n_tokens, vocab, &got_index, &got_token); + CHECK_EQ_INT(stats, got_found, expected_found, "randomized OOB scan should match reference result"); + if (expected_found) { + CHECK_EQ_SIZE(stats, got_index, expected_index, "randomized OOB index should match reference"); + CHECK_EQ_INT(stats, got_token, expected_token, "randomized OOB token should match reference"); + } + } + stats->passed++; +} + int main(void) { TestStats stats = {0, 0}; + test_bytes_to_token_count_even(&stats); + test_bytes_to_token_count_odd(&stats); + test_bytes_to_token_count_null_outputs(&stats); test_min_tokens_boundary(&stats); test_min_tokens_short(&stats); + test_min_tokens_negative_seq(&stats); test_validate_too_short(&stats); + test_validate_too_short_precedes_oob(&stats); + test_validate_too_short_with_null_err(&stats); test_validate_oob_first(&stats); test_validate_oob_middle(&stats); test_validate_oob_last(&stats); test_validate_ok(&stats); + test_validate_vocab_boundary(&stats); test_find_oob_empty(&stats); + test_find_oob_null_outputs(&stats); + test_find_oob_invalid_vocab(&stats); + test_find_oob_randomized_consistency(&stats); 
printf("test_data_validation: %d passed, %d failed\n", stats.passed, stats.failed); return stats.failed == 0 ? 0 : 1; diff --git a/training/train_large.m b/training/train_large.m index 6894136..4850a40 100644 --- a/training/train_large.m +++ b/training/train_large.m @@ -283,14 +283,28 @@ int main(int argc, char *argv[]) { printf("ANE FLOPs/step: %.0fM (fwd+bwd_dx+sdpa_bwd) | CPU: dW+cls (cblas)\n\n", ane_f/1e6); } - // mmap token data + // mmap token data int data_fd = open(DATA_PATH, O_RDONLY); if (data_fd < 0) { printf("Cannot open %s\n", DATA_PATH); return 1; } - struct stat st; fstat(data_fd, &st); - size_t data_len = st.st_size; + struct stat st; + if (fstat(data_fd, &st) != 0) { perror("fstat"); close(data_fd); return 1; } + size_t data_len = (size_t)st.st_size; + size_t n_tokens = 0, extra_bytes = 0; + if (!token_data_bytes_to_token_count(data_len, &n_tokens, &extra_bytes)) { + fprintf(stderr, + "Token data validation failed: file size %zu bytes has %zu extra byte(s); expected 16-bit tokens\n", + data_len, extra_bytes); + close(data_fd); + return 1; + } + if (n_tokens == 0) { + fprintf(stderr, "Token data validation failed: token file is empty\n"); + close(data_fd); + return 1; + } uint16_t *token_data = (uint16_t*)mmap(NULL, data_len, PROT_READ, MAP_PRIVATE, data_fd, 0); - if (token_data == MAP_FAILED) { printf("mmap failed\n"); close(data_fd); return 1; } - size_t n_tokens = data_len / 2; + if (token_data == MAP_FAILED) { perror("mmap"); close(data_fd); return 1; } + close(data_fd); // mapping remains valid; avoid fd leaks across exec() restarts printf("Token data: %zu tokens (%.1f MB)\n", n_tokens, data_len/1e6); TokenDataValidationError data_err = {0}; @@ -298,13 +312,13 @@ int main(int argc, char *argv[]) { if (data_code == TOKEN_DATA_ERR_TOO_SHORT) { fprintf(stderr, "Token data validation failed: need at least %zu tokens (SEQ+1), got %zu\n", data_err.required_tokens, n_tokens); - munmap(token_data, data_len); close(data_fd); + munmap(token_data, 
data_len); return 1; } if (data_code == TOKEN_DATA_ERR_OOB_TOKEN) { fprintf(stderr, "Token data validation failed: token %u at index %zu is outside vocab [0, %d)\n", data_err.bad_token, data_err.bad_index, VOCAB); - munmap(token_data, data_len); close(data_fd); + munmap(token_data, data_len); return 1; } @@ -695,19 +709,18 @@ int main(int argc, char *argv[]) { printf("ANE utilization: %.1f%% of 15.8 TFLOPS\n", 100*ane_flops/(total_train_ms*1e9)/15.8); // Cleanup - for (int L=0; L