diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md index dbc6825..9998827 100644 --- a/DOCUMENTATION.md +++ b/DOCUMENTATION.md @@ -83,7 +83,7 @@ All examples are verified by running `zig build examples-check` - `lastChild()` - `nextSibling()` - `prevSibling()` - - `children()` (borrowed `[]const u32` index view) + - `children()` (iterator of wrapped child nodes; `collect(allocator)` returns an owned `[]Node`) - Text: - `innerText(allocator)` (borrowed or allocated depending on shape) - `innerTextWithOptions(allocator, TextOptions)` @@ -193,49 +193,49 @@ Source: `bench/results/latest.json` (`stable` profile). | Fixture | ours | lol-html | lexbor | |---|---:|---:|---:| -| `rust-lang.html` | 1447.99 | 1474.65 | 332.72 | -| `wiki-html.html` | 1645.45 | 1215.04 | 271.24 | -| `mdn-html.html` | 2570.09 | 1879.00 | 404.50 | -| `w3-html52.html` | 1064.19 | 764.62 | 199.22 | -| `hn.html` | 1263.60 | 885.26 | 223.15 | -| `python-org.html` | 1549.02 | 1356.21 | 284.19 | -| `kernel-org.html` | 1440.47 | 1300.81 | 276.52 | -| `gnu-org.html` | 1917.36 | 1482.15 | 317.74 | -| `ziglang-org.html` | 1480.49 | 1257.62 | 291.72 | -| `ziglang-doc-master.html` | 1122.44 | 987.16 | 214.23 | -| `wikipedia-unicode-list.html` | 1247.00 | 1024.98 | 215.21 | -| `whatwg-html-spec.html` | 1113.73 | 841.16 | 210.83 | -| `synthetic-forms.html` | 1046.17 | 710.72 | 174.94 | -| `synthetic-table-grid.html` | 768.56 | 622.31 | 152.86 | -| `synthetic-list-nested.html` | 833.77 | 598.02 | 152.45 | -| `synthetic-comments-doctype.html` | 1200.72 | 827.66 | 212.09 | -| `synthetic-template-rich.html` | 628.02 | 444.34 | 134.10 | -| `synthetic-whitespace-noise.html` | 1104.21 | 919.69 | 170.33 | -| `synthetic-news-feed.html` | 835.27 | 577.95 | 144.46 | -| `synthetic-ecommerce.html` | 787.72 | 556.51 | 151.95 | -| `synthetic-forum-thread.html` | 839.48 | 579.84 | 143.06 | +| `rust-lang.html` | 2132.70 | 1501.81 | 324.53 | +| `wiki-html.html` | 1991.85 | 1220.28 | 267.50 | +| `mdn-html.html` | 2939.75 | 1894.70 
| 408.76 | +| `w3-html52.html` | 956.25 | 754.98 | 196.30 | +| `hn.html` | 1595.94 | 893.48 | 221.75 | +| `python-org.html` | 2116.13 | 1354.79 | 280.79 | +| `kernel-org.html` | 1979.68 | 1335.72 | 290.65 | +| `gnu-org.html` | 2368.06 | 1490.41 | 313.42 | +| `ziglang-org.html` | 1874.52 | 1299.04 | 289.84 | +| `ziglang-doc-master.html` | 1431.90 | 1045.18 | 225.11 | +| `wikipedia-unicode-list.html` | 1647.71 | 1081.56 | 226.67 | +| `whatwg-html-spec.html` | 1344.31 | 892.66 | 219.37 | +| `synthetic-forms.html` | 1396.29 | 781.68 | 189.28 | +| `synthetic-table-grid.html` | 1086.37 | 724.33 | 169.46 | +| `synthetic-list-nested.html` | 1190.11 | 652.22 | 161.58 | +| `synthetic-comments-doctype.html` | 1862.19 | 943.51 | 223.61 | +| `synthetic-template-rich.html` | 913.72 | 465.79 | 143.20 | +| `synthetic-whitespace-noise.html` | 1592.74 | 1062.32 | 189.70 | +| `synthetic-news-feed.html` | 1179.12 | 646.78 | 157.61 | +| `synthetic-ecommerce.html` | 1134.48 | 637.67 | 163.21 | +| `synthetic-forum-thread.html` | 1145.80 | 643.29 | 161.14 | #### Query Match Throughput (ours) | Case | ours ops/s | ours ns/op | |---|---:|---:| -| `attr-heavy-button` | 1148936.76 | 870.37 | -| `attr-heavy-nav` | 1130790.00 | 884.34 | +| `attr-heavy-button` | 191877.12 | 5211.67 | +| `attr-heavy-nav` | 87885.90 | 11378.39 | #### Cached Query Throughput (ours) | Case | ours ops/s | ours ns/op | |---|---:|---:| -| `attr-heavy-button` | 1305257.78 | 766.13 | -| `attr-heavy-nav` | 1347173.46 | 742.29 | +| `attr-heavy-button` | 144352.80 | 6927.47 | +| `attr-heavy-nav` | 120324.49 | 8310.86 | #### Query Parse Throughput (ours) | Selector case | Ops/s | ns/op | |---|---:|---:| -| `simple` | 17335919.85 | 57.68 | -| `complex` | 5836657.49 | 171.33 | -| `grouped` | 6396371.26 | 156.34 | +| `simple` | 10465915.34 | 95.55 | +| `complex` | 4910511.63 | 203.64 | +| `grouped` | 6290595.08 | 158.97 | For full per-parser, per-fixture tables and gate output: - `bench/results/latest.md` diff --git a/README.md 
b/README.md index 7b6d064..e8b6007 100644 --- a/README.md +++ b/README.md @@ -25,9 +25,9 @@ Source: `bench/results/latest.json` (`stable` profile). ### Parse Throughput (Average Across Fixtures) ```text -ours │████████████████████│ 1233.61 MB/s (100.00%) -lol-html │████████████████░░░░│ 966.94 MB/s (78.38%) -lexbor │████░░░░░░░░░░░░░░░░│ 222.74 MB/s (18.06%) +ours │████████████████████│ 1613.31 MB/s (100.00%) +lol-html │█████████████░░░░░░░│ 1015.34 MB/s (62.94%) +lexbor │███░░░░░░░░░░░░░░░░░│ 229.69 MB/s (14.24%) ``` ### Conformance Snapshot diff --git a/build.zig b/build.zig index cfb53a1..239c8e6 100644 --- a/build.zig +++ b/build.zig @@ -22,6 +22,15 @@ pub fn build(b: *std.Build) void { }), }); + const parse_mode_mod = b.createModule(.{ + .root_source_file = b.path("tools/parse_mode.zig"), + .target = target, + .optimize = optimize, + .imports = &.{ + .{ .name = "htmlparser", .module = mod }, + }, + }); + const bench_exe = b.addExecutable(.{ .name = "htmlparser-bench", .root_module = b.createModule(.{ @@ -30,6 +39,7 @@ pub fn build(b: *std.Build) void { .optimize = optimize, .imports = &.{ .{ .name = "htmlparser", .module = mod }, + .{ .name = "parse_mode", .module = parse_mode_mod }, }, }), }); @@ -111,11 +121,13 @@ pub fn build(b: *std.Build) void { const mod_tests = b.addTest(.{ .root_module = mod, + .test_runner = .{ .path = b.path("tools/test_runner.zig"), .mode = .simple }, }); const run_mod_tests = b.addRunArtifact(mod_tests); const exe_tests = b.addTest(.{ .root_module = exe.root_module, + .test_runner = .{ .path = b.path("tools/test_runner.zig"), .mode = .simple }, }); const run_exe_tests = b.addRunArtifact(exe_tests); @@ -126,8 +138,17 @@ pub fn build(b: *std.Build) void { .optimize = optimize, .imports = &.{ .{ .name = "htmlparser", .module = mod }, + .{ .name = "examples", .module = b.createModule(.{ + .root_source_file = b.path("examples/examples.zig"), + .target = target, + .optimize = optimize, + .imports = &.{ + .{ .name = "htmlparser", .module 
= mod }, + }, + }) }, }, }), + .test_runner = .{ .path = b.path("tools/test_runner.zig"), .mode = .simple }, }); const run_examples_tests = b.addRunArtifact(examples_tests); @@ -140,14 +161,41 @@ pub fn build(b: *std.Build) void { .{ .name = "htmlparser", .module = mod }, }, }), + .test_runner = .{ .path = b.path("tools/test_runner.zig"), .mode = .simple }, }); const run_behavioral_tests = b.addRunArtifact(behavioral_tests); + const scripts_tests = b.addTest(.{ + .root_module = b.createModule(.{ + .root_source_file = b.path("tools/scripts.zig"), + .target = target, + .optimize = optimize, + }), + .test_runner = .{ .path = b.path("tools/test_runner.zig"), .mode = .simple }, + }); + const run_scripts_tests = b.addRunArtifact(scripts_tests); + + const bench_tests = b.addTest(.{ + .root_module = b.createModule(.{ + .root_source_file = b.path("tools/bench/bench.zig"), + .target = target, + .optimize = optimize, + .imports = &.{ + .{ .name = "htmlparser", .module = mod }, + .{ .name = "parse_mode", .module = parse_mode_mod }, + }, + }), + .test_runner = .{ .path = b.path("tools/test_runner.zig"), .mode = .simple }, + }); + const run_bench_tests = b.addRunArtifact(bench_tests); + const test_step = b.step("test", "Run tests"); test_step.dependOn(&run_mod_tests.step); test_step.dependOn(&run_exe_tests.step); test_step.dependOn(&run_examples_tests.step); test_step.dependOn(&run_behavioral_tests.step); + test_step.dependOn(&run_scripts_tests.step); + test_step.dependOn(&run_bench_tests.step); const ship_check_step = b.step("ship-check", "Run release-readiness checks (test + docs + examples)"); ship_check_step.dependOn(test_step); diff --git a/examples/basic_parse_query.zig b/examples/basic_parse_query.zig index b7bbb74..b4cc22f 100644 --- a/examples/basic_parse_query.zig +++ b/examples/basic_parse_query.zig @@ -3,7 +3,7 @@ const html = @import("htmlparser"); const default_options: html.ParseOptions = .{}; const Document = default_options.GetDocument(); -fn run() !void { +pub 
fn run() !void { var doc = Document.init(std.testing.allocator); defer doc.deinit(); diff --git a/examples/cached_selector.zig b/examples/cached_selector.zig index 376f6f9..4840ede 100644 --- a/examples/cached_selector.zig +++ b/examples/cached_selector.zig @@ -3,7 +3,7 @@ const html = @import("htmlparser"); const default_options: html.ParseOptions = .{}; const Document = default_options.GetDocument(); -fn run() !void { +pub fn run() !void { var doc = Document.init(std.testing.allocator); defer doc.deinit(); @@ -20,7 +20,7 @@ fn run() !void { defer arena.deinit(); const sel = try html.Selector.compileRuntime(arena.allocator(), "a[href^=https][class~=button]"); - const first = doc.queryOneCached(&sel) orelse return error.TestUnexpectedResult; + const first = doc.queryOneCached(sel) orelse return error.TestUnexpectedResult; try std.testing.expectEqualStrings("a1", first.getAttributeValue("id").?); } diff --git a/examples/debug_query_report.zig b/examples/debug_query_report.zig index 9ac09c9..50a70c5 100644 --- a/examples/debug_query_report.zig +++ b/examples/debug_query_report.zig @@ -3,19 +3,19 @@ const html = @import("htmlparser"); const default_options: html.ParseOptions = .{}; const Document = default_options.GetDocument(); -fn run() !void { +pub fn run() !void { var doc = Document.init(std.testing.allocator); defer doc.deinit(); var input = "
".*; try doc.parse(&input, .{}); - var report: html.QueryDebugReport = .{}; - const node = try doc.queryOneRuntimeDebug("a[href^=https]", &report); - try std.testing.expect(node == null); - try std.testing.expect(report.visited_elements > 0); - try std.testing.expect(report.near_miss_len > 0); - try std.testing.expect(report.near_misses[0].reason.kind != .none); + const result = doc.queryOneRuntimeDebug("a[href^=https]"); + try std.testing.expect(result.err == null); + try std.testing.expect(result.node == null); + try std.testing.expect(result.report.visited_elements > 0); + try std.testing.expect(result.report.near_miss_len > 0); + try std.testing.expect(result.report.near_misses[0].reason.kind != .none); } test "query debug report for selector mismatch" { diff --git a/examples/examples.zig b/examples/examples.zig new file mode 100644 index 0000000..d93e1b0 --- /dev/null +++ b/examples/examples.zig @@ -0,0 +1,9 @@ +pub const basic_parse_query = @import("basic_parse_query.zig"); +pub const runtime_selector = @import("runtime_selector.zig"); +pub const cached_selector = @import("cached_selector.zig"); +pub const navigation_and_children = @import("navigation_and_children.zig"); +pub const inner_text_options = @import("inner_text_options.zig"); +pub const strict_vs_fastest_parse = @import("strict_vs_fastest_parse.zig"); +pub const debug_query_report = @import("debug_query_report.zig"); +pub const instrumentation_hooks = @import("instrumentation_hooks.zig"); +pub const query_time_decode = @import("query_time_decode.zig"); diff --git a/examples/inner_text_options.zig b/examples/inner_text_options.zig index 5c9fab1..e0df31c 100644 --- a/examples/inner_text_options.zig +++ b/examples/inner_text_options.zig @@ -3,7 +3,7 @@ const html = @import("htmlparser"); const default_options: html.ParseOptions = .{}; const Document = default_options.GetDocument(); -fn run() !void { +pub fn run() !void { var doc = Document.init(std.testing.allocator); defer doc.deinit(); diff --git 
a/examples/instrumentation_hooks.zig b/examples/instrumentation_hooks.zig index 490d4e9..5dfe6aa 100644 --- a/examples/instrumentation_hooks.zig +++ b/examples/instrumentation_hooks.zig @@ -26,17 +26,17 @@ const Hooks = struct { } }; -fn run() !void { +pub fn run() !void { var doc = Document.init(std.testing.allocator); defer doc.deinit(); var hooks: Hooks = .{}; var input = "owned
= slice.len) return null; +test "scanTextRun tracks next tag and whitespace-only runs" { + const a = scanTextRun(" \n\t", 0); + try std.testing.expectEqual(@as(usize, 3), a.lt_index); + try std.testing.expect(!a.has_non_whitespace); - var i: usize = start_index; - if (!@inComptime()) { - if (std.simd.suggestVectorLength(u8)) |block_len| { - // For Intel Nehalem (2009) and AMD Bulldozer (2012) or later, unaligned loads on aligned data result - // in the same execution as aligned loads. We ignore older arch's here and don't bother pre-aligning. - // - // Use `std.simd.suggestVectorLength(T)` to get the same alignment as used in this function - // however this usually isn't necessary unless your arch has a performance penalty due to this. - // - // This may differ for other arch's. Arm for example costs a cycle when loading across a cache - // line so explicit alignment prologues may be worth exploration. - - // Unrolling here is ~10% improvement. We can then do one bounds check every 2 blocks - // instead of one which adds up. 
- const Block = @Vector(block_len, u8); - if (i + 2 * block_len < slice.len) { - const mask: Block = @splat(value); - while (true) { - inline for (0..2) |_| { - const block: Block = slice[i..][0..block_len].*; - const matches = block == mask; - if (@reduce(.Or, matches)) { - return i + std.simd.firstTrue(matches).?; - } - i += block_len; - } - if (i + 2 * block_len >= slice.len) break; - } - } + const b = scanTextRun(" hi", 0); + try std.testing.expectEqual(@as(usize, 4), b.lt_index); + try std.testing.expect(b.has_non_whitespace); - // {block_len, block_len / 2} check - inline for (0..2) |j| { - const block_x_len = block_len / (1 << j); - comptime if (block_x_len < 4) break; - - const BlockX = @Vector(block_x_len, u8); - if (i + block_x_len < slice.len) { - const mask: BlockX = @splat(value); - const block: BlockX = slice[i..][0..block_x_len].*; - const matches = block == mask; - if (@reduce(.Or, matches)) { - return i + std.simd.firstTrue(matches).?; - } - i += block_x_len; - } - } - } - } - - for (slice[i..], i..) |c, j| { - if (c == value) return j; - } - return null; -} - -test "findByte helper matches scalar behavior" { - const s = "abc" { diff --git a/src/html/tables.zig b/src/html/tables.zig index bd352b3..97043a3 100644 --- a/src/html/tables.zig +++ b/src/html/tables.zig @@ -22,23 +22,23 @@ pub fn makeLowerTable() [256]u8 { } /// Returns whether byte is ASCII whitespace relevant to HTML tokenization. -pub fn isWhitespace(c: u8) bool { +fn isWhitespace(c: u8) bool { return c == ' ' or c == '\n' or c == '\r' or c == '\t' or c == '\x0c'; } /// Returns whether byte is a valid identifier start. -pub fn isIdentStart(c: u8) bool { +fn isIdentStart(c: u8) bool { return (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z') or c == '_' or c == ':'; } /// Returns whether byte is a valid identifier continuation. 
-pub fn isIdentChar(c: u8) bool { +fn isIdentChar(c: u8) bool { return isIdentStart(c) or (c >= '0' and c <= '9') or c == '-' or c == '.'; } /// Returns whether byte is consumed by the HTML tag-name state. /// Matches the tokenizer shape: continue until whitespace, `/`, `>`, or NUL. -pub fn isTagNameChar(c: u8) bool { +fn isTagNameChar(c: u8) bool { return !isWhitespace(c) and c != '/' and c != '>' and c != 0; } @@ -53,6 +53,11 @@ pub const IdentCharTable = makeClassTable(isIdentChar); /// Precomputed tag-name-char classification table. pub const TagNameCharTable = makeClassTable(isTagNameChar); +/// 32-bit FNV-1a offset basis. +pub const FnvOffset: u32 = 2166136261; +/// 32-bit FNV-1a prime. +pub const FnvPrime: u32 = 16777619; + /// Lowercases one ASCII byte using `LowerTable`. pub inline fn lower(c: u8) u8 { return LowerTable[c]; @@ -78,6 +83,20 @@ pub fn startsWithIgnoreCaseAscii(hay: []const u8, needle: []const u8) bool { return eqlIgnoreCaseAscii(hay[0..needle.len], needle); } +/// Hashes ASCII bytes case-insensitively using 32-bit FNV-1a. +pub fn hashIgnoreCaseAscii(bytes: []const u8) u32 { + var h: u32 = FnvOffset; + for (bytes) |c| { + h = hashIgnoreCaseAsciiUpdate(h, c); + } + return h; +} + +/// Incremental step for ASCII case-insensitive FNV-1a hashing. +pub inline fn hashIgnoreCaseAsciiUpdate(h: u32, c: u8) u32 { + return (h ^ @as(u32, lower(c))) *% FnvPrime; +} + /// Trims ASCII whitespace from both ends of `slice`. pub fn trimAsciiWhitespace(slice: []const u8) []const u8 { var start: usize = 0; diff --git a/src/html/tags.zig b/src/html/tags.zig index f6d04e6..5bfd00e 100644 --- a/src/html/tags.zig +++ b/src/html/tags.zig @@ -208,7 +208,12 @@ pub inline fn mayTriggerImplicitCloseWithKey(new_tag: []const u8, new_key: u64) /// Returns true when `open_tag` is an optional-close source tag. 
pub fn isImplicitCloseSourceWithKey(open_tag: []const u8, open_key: u64) bool { - return switch (open_tag.len) { + return isImplicitCloseSourceWithLenAndKey(open_tag.len, open_key); +} + +/// Returns true when `open_tag_len`/`open_key` represent an optional-close source tag. +pub fn isImplicitCloseSourceWithLenAndKey(open_tag_len: usize, open_key: u64) bool { + return switch (open_tag_len) { 1 => open_key == KEY.P, 2 => switch (open_key) { KEY.LI, @@ -234,7 +239,12 @@ pub fn isImplicitCloseSourceWithKey(open_tag: []const u8, open_key: u64) bool { /// Optional-close predicate with precomputed `(len,key)` fast path. pub fn shouldImplicitlyCloseWithKeys(open_tag: []const u8, open_key: u64, new_tag: []const u8, new_key: u64) bool { - return switch (open_tag.len) { + return shouldImplicitlyCloseWithLenAndKey(open_tag.len, open_key, new_tag, new_key); +} + +/// Optional-close predicate with caller-provided open-tag length. +pub fn shouldImplicitlyCloseWithLenAndKey(open_tag_len: usize, open_key: u64, new_tag: []const u8, new_key: u64) bool { + return switch (open_tag_len) { 1 => open_key == KEY.P and closesPWithKey(new_tag, new_key), 2 => switch (open_key) { KEY.LI => new_key == KEY.LI, diff --git a/src/main.zig b/src/main.zig index f41a72d..8ddb598 100644 --- a/src/main.zig +++ b/src/main.zig @@ -4,9 +4,9 @@ const default_options: htmlparser.ParseOptions = .{}; const Document = default_options.GetDocument(); /// Minimal stdout smoke print used by the demo executable. -pub fn bufferedPrint() !void { +pub fn bufferedPrint(io: std.Io) !void { var stdout_buffer: [1024]u8 = undefined; - var stdout_writer = std.fs.File.stdout().writer(&stdout_buffer); + var stdout_writer = std.Io.File.stdout().writer(io, &stdout_buffer); const stdout = &stdout_writer.interface; try stdout.print("htmlparser: run `zig build test`\n", .{}); @@ -14,8 +14,8 @@ pub fn bufferedPrint() !void { } /// Demo executable entrypoint that parses a tiny document and prints one query result. 
-pub fn main() !void { - try bufferedPrint(); +pub fn main(init: std.process.Init) !void { + try bufferedPrint(init.io); var doc = Document.init(std.heap.page_allocator); defer doc.deinit(); diff --git a/src/root.zig b/src/root.zig index 5011044..a65aaea 100644 --- a/src/root.zig +++ b/src/root.zig @@ -97,10 +97,10 @@ test "writeHtml serializes node subtree" { const div = doc.queryOne("div") orelse return error.TestUnexpectedResult; - var out = std.ArrayList(u8).empty; - defer out.deinit(alloc); - try div.writeHtml(out.writer(alloc)); - try std.testing.expectEqualStrings("v", out.items); + var out: std.Io.Writer.Allocating = .init(alloc); + defer out.deinit(); + try div.writeHtml(&out.writer); + try std.testing.expectEqualStrings("v", out.written()); } test "writeHtml respects in-place attr parsing and void tags" { @@ -118,10 +118,10 @@ test "writeHtml respects in-place attr parsing and void tags" { _ = img.getAttributeValue("class") orelse return error.TestUnexpectedResult; _ = img.getAttributeValue("data-q") orelse return error.TestUnexpectedResult; - var out = std.ArrayList(u8).empty; - defer out.deinit(alloc); - try img.writeHtml(out.writer(alloc)); - try std.testing.expectEqualStrings("2\">", out.items); + var out: std.Io.Writer.Allocating = .init(alloc); + defer out.deinit(); + try img.writeHtml(&out.writer); + try std.testing.expectEqualStrings("
2\">", out.written()); } test "writeHtml reflects in-place text decoding" { @@ -138,10 +138,10 @@ test "writeHtml reflects in-place text decoding" { const p = doc.queryOne("p") orelse return error.TestUnexpectedResult; _ = try p.innerText(alloc); - var out = std.ArrayList(u8).empty; - defer out.deinit(alloc); - try p.writeHtml(out.writer(alloc)); - try std.testing.expectEqualStrings("
& <
", out.items); + var out: std.Io.Writer.Allocating = .init(alloc); + defer out.deinit(); + try p.writeHtml(&out.writer); + try std.testing.expectEqualStrings("& <
", out.written()); } test "writeHtml drops whitespace-only text nodes when configured" { @@ -157,10 +157,10 @@ test "writeHtml drops whitespace-only text nodes when configured" { const div = doc.queryOne("div") orelse return error.TestUnexpectedResult; - var out = std.ArrayList(u8).empty; - defer out.deinit(alloc); - try div.writeHtml(out.writer(alloc)); - try std.testing.expectEqualStrings("a b c", out.items); + var out: std.Io.Writer.Allocating = .init(alloc); + defer out.deinit(); + try div.writeHtml(&out.writer); + try std.testing.expectEqualStrings("a b c", out.written()); } test "writeHtml parses and prints complex document" { @@ -229,8 +229,8 @@ test "writeHtmlSelf excludes children" { const div = doc.queryOne("div") orelse return error.TestUnexpectedResult; - var out = std.ArrayList(u8).empty; - defer out.deinit(alloc); - try div.writeHtmlSelf(out.writer(alloc)); - try std.testing.expectEqualStrings("", out.items); + var out: std.Io.Writer.Allocating = .init(alloc); + defer out.deinit(); + try div.writeHtmlSelf(&out.writer); + try std.testing.expectEqualStrings("", out.written()); } diff --git a/src/selector/ast.zig b/src/selector/ast.zig index 182642d..1bd1c97 100644 --- a/src/selector/ast.zig +++ b/src/selector/ast.zig @@ -9,7 +9,7 @@ pub const Combinator = enum(u8) { sibling, /// Formats this combinator for human-readable output. - pub fn format(self: @This(), writer: *std.io.Writer) std.io.Writer.Error!void { + pub fn format(self: @This(), writer: *std.Io.Writer) std.Io.Writer.Error!void { try writer.writeAll(@tagName(self)); } }; @@ -25,7 +25,7 @@ pub const AttrOp = enum(u8) { dash_match, /// Formats this attribute operator for human-readable output. 
- pub fn format(self: @This(), writer: *std.io.Writer) std.io.Writer.Error!void { + pub fn format(self: @This(), writer: *std.Io.Writer) std.Io.Writer.Error!void { try writer.writeAll(@tagName(self)); } }; @@ -61,7 +61,7 @@ pub const Range = extern struct { } /// Formats this range for human-readable output. - pub fn format(self: @This(), writer: *std.io.Writer) std.io.Writer.Error!void { + pub fn format(self: @This(), writer: *std.Io.Writer) std.Io.Writer.Error!void { try writer.print("Range{{start={}, len={}}}", .{ self.start, self.len }); } }; @@ -74,7 +74,7 @@ pub const AttrSelector = extern struct { value: Range = .{}, /// Formats this attribute selector for human-readable output. - pub fn format(self: @This(), writer: *std.io.Writer) std.io.Writer.Error!void { + pub fn format(self: @This(), writer: *std.Io.Writer) std.Io.Writer.Error!void { try writer.writeAll("AttrSelector{name="); try self.name.format(writer); try writer.print(", name_hash={}, op={s}, value=", .{ self.name_hash, @tagName(self.op) }); @@ -100,7 +100,7 @@ pub const NthExpr = extern struct { } /// Formats this nth expression for human-readable output. - pub fn format(self: @This(), writer: *std.io.Writer) std.io.Writer.Error!void { + pub fn format(self: @This(), writer: *std.Io.Writer) std.Io.Writer.Error!void { try writer.print("NthExpr{{a={}, b={}}}", .{ self.a, self.b }); } }; @@ -112,7 +112,7 @@ pub const PseudoKind = enum(u8) { nth_child, /// Formats this pseudo kind for human-readable output. - pub fn format(self: @This(), writer: *std.io.Writer) std.io.Writer.Error!void { + pub fn format(self: @This(), writer: *std.Io.Writer) std.Io.Writer.Error!void { try writer.writeAll(@tagName(self)); } }; @@ -123,7 +123,7 @@ pub const Pseudo = extern struct { nth: NthExpr = .{ .a = 0, .b = 1 }, /// Formats this pseudo selector for human-readable output. 
- pub fn format(self: @This(), writer: *std.io.Writer) std.io.Writer.Error!void { + pub fn format(self: @This(), writer: *std.Io.Writer) std.Io.Writer.Error!void { try writer.print("Pseudo{{kind={s}, nth=", .{@tagName(self.kind)}); try self.nth.format(writer); try writer.writeAll("}"); @@ -138,7 +138,7 @@ pub const NotKind = enum(u8) { attr, /// Formats this not-kind for human-readable output. - pub fn format(self: @This(), writer: *std.io.Writer) std.io.Writer.Error!void { + pub fn format(self: @This(), writer: *std.Io.Writer) std.Io.Writer.Error!void { try writer.writeAll(@tagName(self)); } }; @@ -150,7 +150,7 @@ pub const NotSimple = extern struct { attr: AttrSelector = .{ .name = .{}, .op = .exists, .value = .{} }, /// Formats this `:not` predicate for human-readable output. - pub fn format(self: @This(), writer: *std.io.Writer) std.io.Writer.Error!void { + pub fn format(self: @This(), writer: *std.Io.Writer) std.Io.Writer.Error!void { try writer.print("NotSimple{{kind={s}, text=", .{@tagName(self.kind)}); try self.text.format(writer); try writer.writeAll(", attr="); @@ -163,11 +163,8 @@ pub const NotSimple = extern struct { pub const Compound = extern struct { combinator: Combinator = .none, - has_tag: u8 = 0, tag: Range = .{}, tag_key: u64 = 0, - - has_id: u8 = 0, id: Range = .{}, class_start: u32 = 0, @@ -182,11 +179,19 @@ pub const Compound = extern struct { not_start: u32 = 0, not_len: u32 = 0, + pub fn hasTag(self: @This()) bool { + return !self.tag.isEmpty(); + } + + pub fn hasId(self: @This()) bool { + return !self.id.isEmpty(); + } + /// Formats this compound selector for human-readable output. 
- pub fn format(self: @This(), writer: *std.io.Writer) std.io.Writer.Error!void { - try writer.print("Compound{{combinator={s}, has_tag={}, tag=", .{ @tagName(self.combinator), self.has_tag }); + pub fn format(self: @This(), writer: *std.Io.Writer) std.Io.Writer.Error!void { + try writer.print("Compound{{combinator={s}, tag=", .{@tagName(self.combinator)}); try self.tag.format(writer); - try writer.print(", tag_key={}, has_id={}, id=", .{ self.tag_key, self.has_id }); + try writer.print(", tag_key={}, id=", .{self.tag_key}); try self.id.format(writer); try writer.print( ", class_start={}, class_len={}, attr_start={}, attr_len={}, pseudo_start={}, pseudo_len={}, not_start={}, not_len={}}}", @@ -210,7 +215,7 @@ pub const Group = extern struct { compound_len: u32, /// Formats this selector group for human-readable output. - pub fn format(self: @This(), writer: *std.io.Writer) std.io.Writer.Error!void { + pub fn format(self: @This(), writer: *std.Io.Writer) std.Io.Writer.Error!void { try writer.print("Group{{compound_start={}, compound_len={}}}", .{ self.compound_start, self.compound_len }); } }; @@ -218,7 +223,6 @@ pub const Group = extern struct { /// Compiled selector used by matcher/query APIs. pub const Selector = struct { source: []const u8, - requires_parent: bool = false, groups: []const Group, compounds: []const Compound, classes: []const Range, @@ -249,12 +253,11 @@ pub const Selector = struct { } /// Formats this selector summary for human-readable output. 
- pub fn format(self: @This(), writer: *std.io.Writer) std.io.Writer.Error!void { + pub fn format(self: @This(), writer: *std.Io.Writer) std.Io.Writer.Error!void { try writer.print( - "Selector{{source=\"{s}\", requires_parent={}, groups={}, compounds={}, classes={}, attrs={}, pseudos={}, not_items={}}}", + "Selector{{source=\"{s}\", groups={}, compounds={}, classes={}, attrs={}, pseudos={}, not_items={}}}", .{ self.source, - self.requires_parent, self.groups.len, self.compounds.len, self.classes.len, @@ -328,10 +331,8 @@ test "format selector AST types" { const compound: Compound = .{ .combinator = .child, - .has_tag = 1, .tag = Range.from(0, 3), .tag_key = 0xabc, - .has_id = 1, .id = Range.from(4, 6), .class_start = 1, .class_len = 2, @@ -345,7 +346,7 @@ test "format selector AST types" { const compound_out = try std.fmt.allocPrint(alloc, "{f}", .{compound}); defer alloc.free(compound_out); try std.testing.expectEqualStrings( - "Compound{combinator=child, has_tag=1, tag=Range{start=0, len=3}, tag_key=2748, has_id=1, id=Range{start=4, len=2}, class_start=1, class_len=2, attr_start=3, attr_len=4, pseudo_start=5, pseudo_len=6, not_start=7, not_len=8}", + "Compound{combinator=child, tag=Range{start=0, len=3}, tag_key=2748, id=Range{start=4, len=2}, class_start=1, class_len=2, attr_start=3, attr_len=4, pseudo_start=5, pseudo_len=6, not_start=7, not_len=8}", compound_out, ); @@ -362,7 +363,6 @@ test "format selector AST types" { const not_items = [_]NotSimple{not_simple}; const selector: Selector = .{ .source = "div.cls", - .requires_parent = true, .groups = groups[0..], .compounds = compounds[0..], .classes = classes[0..], @@ -373,7 +373,7 @@ test "format selector AST types" { const selector_out = try std.fmt.allocPrint(alloc, "{f}", .{selector}); defer alloc.free(selector_out); try std.testing.expectEqualStrings( - "Selector{source=\"div.cls\", requires_parent=true, groups=1, compounds=1, classes=1, attrs=1, pseudos=1, not_items=1}", + "Selector{source=\"div.cls\", 
groups=1, compounds=1, classes=1, attrs=1, pseudos=1, not_items=1}", selector_out, ); } diff --git a/src/selector/compile_time.zig b/src/selector/compile_time.zig index 46fd8b0..d549bf0 100644 --- a/src/selector/compile_time.zig +++ b/src/selector/compile_time.zig @@ -1,6 +1,7 @@ const std = @import("std"); const ast = @import("ast.zig"); const runtime = @import("runtime.zig"); +const test_helpers = @import("test_helpers.zig"); /// Allocator facade that allows selector compilation during comptime execution. pub const ComptimeAllocator = struct { @@ -45,7 +46,7 @@ pub const ComptimeAllocator = struct { } /// Formats this allocator marker for human-readable output. - pub fn format(_: @This(), writer: *std.io.Writer) std.io.Writer.Error!void { + pub fn format(_: @This(), writer: *std.Io.Writer) std.Io.Writer.Error!void { try writer.writeAll("ComptimeAllocator{}"); } }; @@ -65,7 +66,6 @@ pub fn compileImpl(comptime source: []const u8) ast.Selector { return .{ .source = source, - .requires_parent = parsed.requires_parent, .groups = groups, .compounds = compounds, .classes = classes, @@ -89,31 +89,12 @@ test "compile-time parser" { test "compile-time parser covers all attribute operators" { const sel = comptime compileImpl("div[a][b=v][c^=x][d$=y][e*=z][f~=m][g|=en]"); - try std.testing.expectEqual(@as(usize, 1), sel.groups.len); - try std.testing.expectEqual(@as(usize, 1), sel.compounds.len); - - const comp = sel.compounds[0]; - try std.testing.expectEqual(@as(u32, 7), comp.attr_len); - try std.testing.expect(sel.attrs[comp.attr_start + 0].op == .exists); - try std.testing.expect(sel.attrs[comp.attr_start + 1].op == .eq); - try std.testing.expect(sel.attrs[comp.attr_start + 2].op == .prefix); - try std.testing.expect(sel.attrs[comp.attr_start + 3].op == .suffix); - try std.testing.expect(sel.attrs[comp.attr_start + 4].op == .contains); - try std.testing.expect(sel.attrs[comp.attr_start + 5].op == .includes); - try std.testing.expect(sel.attrs[comp.attr_start + 6].op == 
.dash_match); + try test_helpers.expectAllAttributeOps(sel); } test "compile-time parser tracks combinator chain and grouping" { const sel = comptime compileImpl("a b > c + d ~ e, #x"); - try std.testing.expectEqual(@as(usize, 2), sel.groups.len); - try std.testing.expectEqual(@as(usize, 6), sel.compounds.len); - - try std.testing.expect(sel.compounds[0].combinator == .none); - try std.testing.expect(sel.compounds[1].combinator == .descendant); - try std.testing.expect(sel.compounds[2].combinator == .child); - try std.testing.expect(sel.compounds[3].combinator == .adjacent); - try std.testing.expect(sel.compounds[4].combinator == .sibling); - try std.testing.expect(sel.compounds[5].combinator == .none); + try test_helpers.expectCombinatorChain(sel); } test "compile-time parser supports leading combinator and nth-child variants" { diff --git a/src/selector/matcher.zig b/src/selector/matcher.zig index 9a44780..e32847b 100644 --- a/src/selector/matcher.zig +++ b/src/selector/matcher.zig @@ -5,13 +5,20 @@ const tags = @import("../html/tags.zig"); const attr_inline = @import("../html/attr_inline.zig"); const common = @import("../common.zig"); +// SAFETY: Selector AST indices are trusted to be internally consistent +// (group/compound/predicate ranges). Document node indices are validated +// before use; debug asserts guard scope bounds in key entry points. + const InvalidIndex: u32 = common.InvalidIndex; const MaxProbeEntries: usize = 24; const MaxCollectedAttrs: usize = 24; const LocalMatchFrameCap: usize = 48; -const HashId: u32 = hashIgnoreCaseAscii("id"); -const HashClass: u32 = hashIgnoreCaseAscii("class"); +const HashId: u32 = tables.hashIgnoreCaseAscii("id"); +const HashClass: u32 = tables.hashIgnoreCaseAscii("class"); const EnableQueryAccel = true; +// Tag-index accel has shown unstable behavior under optimized bench builds. +// Keep id accel enabled and fall back to scan for tag-only pruning for now. 
+const EnableTagQueryAccel = false; const EnableMultiAttrCollect = true; const isElementLike = common.isElementLike; const matchesScopeAnchor = common.matchesScopeAnchor; @@ -19,8 +26,122 @@ const parentElement = common.parentElement; const prevElementSibling = common.prevElementSibling; const nextElementSibling = common.nextElementSibling; +pub const TraversalBounds = struct { + start: u32, + end_excl: u32, +}; + +pub fn traversalBounds(comptime Doc: type, doc: *const Doc, scope_root: u32) TraversalBounds { + if (scope_root != InvalidIndex and scope_root >= doc.nodes.items.len) { + return .{ .start = 1, .end_excl = 1 }; + } + const start: u32 = if (scope_root == InvalidIndex) 1 else scope_root + 1; + const end_excl: u32 = if (scope_root == InvalidIndex) + @as(u32, @intCast(doc.nodes.items.len)) + else + doc.nodes.items[scope_root].subtree_end + 1; + return .{ .start = start, .end_excl = end_excl }; +} + +pub fn tagMatches(selector_source: []const u8, comp: ast.Compound, node_name: []const u8) bool { + const tag = comp.tag.slice(selector_source); + const tag_key: u64 = if (comp.tag_key != 0) comp.tag_key else tags.first8Key(tag); + const node_key = tags.first8Key(node_name); + return tags.equalByLenAndKeyIgnoreCase(node_name, node_key, tag, tag_key); +} + +pub fn evalAttrOp(raw: []const u8, value: []const u8, op: ast.AttrOp) bool { + return switch (op) { + .exists => true, + .eq => std.mem.eql(u8, raw, value), + .prefix => std.mem.startsWith(u8, raw, value), + .suffix => std.mem.endsWith(u8, raw, value), + .contains => std.mem.indexOf(u8, raw, value) != null, + .includes => tables.tokenIncludesAsciiWhitespace(raw, value), + .dash_match => std.mem.eql(u8, raw, value) or (raw.len > value.len and std.mem.startsWith(u8, raw, value) and raw[value.len] == '-'), + }; +} + +pub fn matchesAttrSelectorDebug( + doc: anytype, + node: anytype, + selector_source: []const u8, + sel: ast.AttrSelector, +) bool { + const name = sel.name.slice(selector_source); + const raw = 
attr_inline.getAttrValue(doc, node, name) orelse return false; + const value = sel.value.slice(selector_source); + return evalAttrOp(raw, value, sel.op); +} + +pub fn matchesNotSimpleCommon(ctx: anytype, item: ast.NotSimple) bool { + return switch (item.kind) { + .tag => tables.eqlIgnoreCaseAscii(ctx.nodeName(), item.text.slice(ctx.selector_source)), + .id => blk: { + const id = item.text.slice(ctx.selector_source); + const v = ctx.getAttrValue("id") orelse break :blk false; + break :blk std.mem.eql(u8, v, id); + }, + .class => ctx.classMatches(item.text.slice(ctx.selector_source)), + .attr => ctx.attrMatches(item.attr), + }; +} + +pub fn NotSimpleCtxFast(comptime Doc: type, comptime Node: type) type { + return struct { + doc: Doc, + node: Node, + probe: *AttrProbe, + collected: ?*CollectedAttrs, + selector_source: []const u8, + + fn nodeName(self: @This()) []const u8 { + return self.node.name_or_text.slice(self.doc.source); + } + + fn getAttrValue(self: @This(), name: []const u8) ?[]const u8 { + const hash = if (std.mem.eql(u8, name, "id")) HashId else tables.hashIgnoreCaseAscii(name); + return attrValueByHashFrom(self.doc, self.node, self.probe, self.collected, name, hash); + } + + fn classMatches(self: @This(), class_name: []const u8) bool { + return hasClass(self.doc, self.node, self.probe, self.collected, class_name); + } + + fn attrMatches(self: @This(), sel: ast.AttrSelector) bool { + return matchesAttrSelector(self.doc, self.node, self.probe, self.collected, self.selector_source, sel); + } + }; +} + +pub fn NotSimpleCtxDebug(comptime Doc: type, comptime Node: type) type { + return struct { + doc: Doc, + node: Node, + selector_source: []const u8, + + fn nodeName(self: @This()) []const u8 { + return self.node.name_or_text.slice(self.doc.source); + } + + fn getAttrValue(self: @This(), name: []const u8) ?[]const u8 { + return attr_inline.getAttrValue(self.doc, self.node, name); + } + + fn classMatches(self: @This(), class_name: []const u8) bool { + const 
class_attr = attr_inline.getAttrValue(self.doc, self.node, "class") orelse return false; + return tables.tokenIncludesAsciiWhitespace(class_attr, class_name); + } + + fn attrMatches(self: @This(), sel: ast.AttrSelector) bool { + return matchesAttrSelectorDebug(self.doc, self.node, self.selector_source, sel); + } + }; +} + /// Returns first matching node index for `selector` within optional `scope_root`. pub fn queryOneIndex(comptime Doc: type, noalias doc: *const Doc, selector: ast.Selector, scope_root: u32) ?u32 { + if (scope_root != InvalidIndex and scope_root >= doc.nodes.items.len) return null; var best: ?u32 = null; for (selector.groups) |group| { if (group.compound_len == 0) continue; @@ -32,6 +153,7 @@ pub fn queryOneIndex(comptime Doc: type, noalias doc: *const Doc, selector: ast. /// Returns whether `node_index` matches any selector group within scope. pub fn matchesSelectorAt(comptime Doc: type, noalias doc: *const Doc, selector: ast.Selector, node_index: u32, scope_root: u32) bool { + if (scope_root != InvalidIndex and scope_root >= doc.nodes.items.len) return false; for (selector.groups) |group| { if (group.compound_len == 0) continue; const rightmost = group.compound_len - 1; @@ -183,49 +305,48 @@ fn firstMatchForGroup(comptime Doc: type, doc: *const Doc, selector: ast.Selecto const comp_abs: usize = @intCast(group.compound_start + rightmost); const comp = selector.compounds[comp_abs]; - if (EnableQueryAccel and @hasDecl(Doc, "queryAccelLookupId") and comp.has_id != 0) { + if (EnableQueryAccel and @hasDecl(Doc, "queryAccelLookupId") and comp.hasId()) { const id = comp.id.slice(selector.source); - var used = false; - if (doc.queryAccelLookupId(id, &used)) |idx| { - if (inScope(doc, idx, scope_root) and matchGroupFromRight(Doc, doc, selector, group, rightmost, idx, scope_root)) { - return idx; - } - // Duplicate ids are common in real HTML. If the accelerated hit does not - // satisfy scope/other predicates, fall back to full scan semantics. 
- } else if (used) { - return null; + switch (doc.queryAccelLookupId(id)) { + .hit => |idx| { + if (inScope(doc, idx, scope_root) and matchGroupFromRight(Doc, doc, selector, group, rightmost, idx, scope_root)) { + return idx; + } + // Duplicate ids are common in real HTML. If the accelerated hit does not + // satisfy scope/other predicates, fall back to full scan semantics. + }, + .miss => return null, + .unavailable => {}, } } - if (EnableQueryAccel and @hasDecl(Doc, "queryAccelLookupTag") and comp.has_tag != 0 and comp.tag_key != 0) { - var used = false; + if (EnableTagQueryAccel and EnableQueryAccel and @hasDecl(Doc, "queryAccelLookupTag") and comp.hasTag()) { const tag = comp.tag.slice(selector.source); - if (doc.queryAccelLookupTag(tag, comp.tag_key, &used)) |candidates| { - if (scope_root != InvalidIndex) { - const scope_end = doc.nodes.items[scope_root].subtree_end; + const tag_key = if (comp.tag_key != 0) comp.tag_key else tags.first8Key(tag); + switch (doc.queryAccelLookupTag(tag, tag_key)) { + .hit => |candidates| { + if (scope_root != InvalidIndex) { + if (scope_root >= doc.nodes.items.len) return null; + const scope_end = doc.nodes.items[scope_root].subtree_end; + for (candidates) |idx| { + if (idx <= scope_root) continue; + if (idx > scope_end) break; + if (matchGroupFromRight(Doc, doc, selector, group, rightmost, idx, scope_root)) return idx; + } + return null; + } for (candidates) |idx| { - if (idx <= scope_root) continue; - if (idx > scope_end) break; if (matchGroupFromRight(Doc, doc, selector, group, rightmost, idx, scope_root)) return idx; } return null; - } - for (candidates) |idx| { - if (matchGroupFromRight(Doc, doc, selector, group, rightmost, idx, scope_root)) return idx; - } - return null; + }, + .unavailable => {}, } - if (used) return null; } - const start: u32 = if (scope_root == InvalidIndex) 1 else scope_root + 1; - const end_excl: u32 = if (scope_root == InvalidIndex) - @as(u32, @intCast(doc.nodes.items.len)) - else - 
doc.nodes.items[scope_root].subtree_end + 1; - - var i = start; - while (i < end_excl and i < doc.nodes.items.len) : (i += 1) { + const bounds = traversalBounds(Doc, doc, scope_root); + var i = bounds.start; + while (i < bounds.end_excl and i < doc.nodes.items.len) : (i += 1) { const node = &doc.nodes.items[i]; if (!isElementLike(node.kind)) continue; if (matchGroupFromRight(Doc, doc, selector, group, rightmost, i, scope_root)) return i; @@ -250,15 +371,12 @@ fn matchesCompound(comptime Doc: type, noalias doc: *const Doc, selector: ast.Se const use_collected = EnableMultiAttrCollect and prepareCollectedAttrs(selector, comp, &collected_attrs); const collected_ptr: ?*CollectedAttrs = if (use_collected) &collected_attrs else null; - if (comp.has_tag != 0) { - const tag = comp.tag.slice(selector.source); - const tag_key: u64 = if (comp.tag_key != 0) comp.tag_key else tags.first8Key(tag); + if (comp.hasTag()) { const node_name = node.name_or_text.slice(doc.source); - const node_key = tags.first8Key(node_name); - if (!tags.equalByLenAndKeyIgnoreCase(node_name, node_key, tag, tag_key)) return false; + if (!tagMatches(selector.source, comp, node_name)) return false; } - if (comp.has_id != 0) { + if (comp.hasId()) { const id = comp.id.slice(selector.source); const value = attrValueByHashFrom( doc, @@ -312,19 +430,18 @@ fn matchesNotSimple( selector_source: []const u8, item: ast.NotSimple, ) bool { - return switch (item.kind) { - .tag => tables.eqlIgnoreCaseAscii(node.name_or_text.slice(doc.source), item.text.slice(selector_source)), - .id => blk: { - const id = item.text.slice(selector_source); - const v = attrValueByHashFrom(doc, node, probe, collected, "id", HashId) orelse break :blk false; - break :blk std.mem.eql(u8, v, id); - }, - .class => hasClass(doc, node, probe, collected, item.text.slice(selector_source)), - .attr => matchesAttrSelector(doc, node, probe, collected, selector_source, item.attr), + const Ctx = NotSimpleCtxFast(@TypeOf(doc), @TypeOf(node)); + const 
ctx = Ctx{ + .doc = doc, + .node = node, + .probe = probe, + .collected = collected, + .selector_source = selector_source, }; + return matchesNotSimpleCommon(ctx, item); } -fn matchesPseudo(doc: anytype, node_index: u32, pseudo: ast.Pseudo) bool { +pub fn matchesPseudo(doc: anytype, node_index: u32, pseudo: ast.Pseudo) bool { return switch (pseudo.kind) { .first_child => prevElementSibling(doc, node_index) == null, .last_child => nextElementSibling(doc, node_index) == null, @@ -349,19 +466,10 @@ fn matchesAttrSelector( sel: ast.AttrSelector, ) bool { const name = sel.name.slice(selector_source); - const name_hash = if (sel.name_hash != 0) sel.name_hash else hashIgnoreCaseAscii(name); + const name_hash = if (sel.name_hash != 0) sel.name_hash else tables.hashIgnoreCaseAscii(name); const raw = attrValueByHashFrom(doc, node, probe, collected, name, name_hash) orelse return false; const value = sel.value.slice(selector_source); - - return switch (sel.op) { - .exists => true, - .eq => std.mem.eql(u8, raw, value), - .prefix => std.mem.startsWith(u8, raw, value), - .suffix => std.mem.endsWith(u8, raw, value), - .contains => std.mem.indexOf(u8, raw, value) != null, - .includes => tables.tokenIncludesAsciiWhitespace(raw, value), - .dash_match => std.mem.eql(u8, raw, value) or (raw.len > value.len and std.mem.startsWith(u8, raw, value) and raw[value.len] == '-'), - }; + return evalAttrOp(raw, value, sel.op); } fn hasClass(doc: anytype, node: anytype, noalias probe: *AttrProbe, collected: ?*CollectedAttrs, class_name: []const u8) bool { @@ -491,14 +599,14 @@ const CollectedAttrs = struct { fn prepareCollectedAttrs(selector: ast.Selector, comp: ast.Compound, out: *CollectedAttrs) bool { out.* = .{}; - if (comp.has_id != 0 and !pushCollectedName(out, "id", HashId)) return false; + if (comp.hasId() and !pushCollectedName(out, "id", HashId)) return false; if (comp.class_len != 0 and !pushCollectedName(out, "class", HashClass)) return false; var attr_i: u32 = 0; while (attr_i < 
comp.attr_len) : (attr_i += 1) { const attr_sel = selector.attrs[comp.attr_start + attr_i]; const name = attr_sel.name.slice(selector.source); - const hash = if (attr_sel.name_hash != 0) attr_sel.name_hash else hashIgnoreCaseAscii(name); + const hash = if (attr_sel.name_hash != 0) attr_sel.name_hash else tables.hashIgnoreCaseAscii(name); if (!pushCollectedName(out, name, hash)) return false; } @@ -510,7 +618,7 @@ fn prepareCollectedAttrs(selector: ast.Selector, comp: ast.Compound, out: *Colle .class => if (!pushCollectedName(out, "class", HashClass)) return false, .attr => { const name = item.attr.name.slice(selector.source); - const hash = if (item.attr.name_hash != 0) item.attr.name_hash else hashIgnoreCaseAscii(name); + const hash = if (item.attr.name_hash != 0) item.attr.name_hash else tables.hashIgnoreCaseAscii(name); if (!pushCollectedName(out, name, hash)) return false; }, else => {}, @@ -553,11 +661,3 @@ fn findProbeEntry(noalias probe: *const AttrProbe, needle: []const u8, needle_ha } return null; } - -fn hashIgnoreCaseAscii(bytes: []const u8) u32 { - var h: u32 = 2166136261; - for (bytes) |c| { - h = (h ^ @as(u32, tables.lower(c))) *% 16777619; - } - return h; -} diff --git a/src/selector/matcher_debug.zig b/src/selector/matcher_debug.zig index ef9335b..5cb13de 100644 --- a/src/selector/matcher_debug.zig +++ b/src/selector/matcher_debug.zig @@ -7,12 +7,8 @@ const attr_inline = @import("../html/attr_inline.zig"); const selector_debug = @import("../debug/selector_debug.zig"); const common = @import("../common.zig"); -const InvalidIndex: u32 = common.InvalidIndex; -const isElementLike = common.isElementLike; -const matchesScopeAnchor = common.matchesScopeAnchor; -const parentElement = common.parentElement; -const prevElementSibling = common.prevElementSibling; -const nextElementSibling = common.nextElementSibling; +// SAFETY: Debug matcher uses the same traversal bounds as the fast matcher +// and only indexes validated node ranges. 
/// Debug matcher that returns first match and records near-miss diagnostics. pub fn explainFirstMatch( @@ -24,16 +20,12 @@ pub fn explainFirstMatch( ) ?u32 { report.reset(selector.source, scope_root, selector.groups.len); - const start: u32 = if (scope_root == InvalidIndex) 1 else scope_root + 1; - const end_excl: u32 = if (scope_root == InvalidIndex) - @as(u32, @intCast(doc.nodes.items.len)) - else - doc.nodes.items[scope_root].subtree_end + 1; + const bounds = matcher.traversalBounds(Doc, doc, scope_root); - var i = start; - while (i < end_excl and i < doc.nodes.items.len) : (i += 1) { + var i = bounds.start; + while (i < bounds.end_excl and i < doc.nodes.items.len) : (i += 1) { const node = &doc.nodes.items[i]; - if (!isElementLike(node.kind)) continue; + if (!common.isElementLike(node.kind)) continue; report.visited_elements += 1; var first_failure: selector_debug.Failure = .{}; @@ -83,7 +75,7 @@ fn classifyGroupFailure( var reason = classifyCompoundFailure(doc, selector, comp, node_index, group_index, comp_abs); if (!reason.isNone()) return reason; - if (group.compound_len == 1 and comp.combinator != .none and !matchesScopeAnchor(doc, comp.combinator, node_index, scope_root)) { + if (group.compound_len == 1 and comp.combinator != .none and !common.matchesScopeAnchor(doc, comp.combinator, node_index, scope_root)) { return .{ .kind = .scope, .group_index = @intCast(@min(group_index, std.math.maxInt(u16))), @@ -115,18 +107,15 @@ fn classifyCompoundFailure( const g: u16 = @intCast(@min(group_index, std.math.maxInt(u16))); const c: u16 = @intCast(@min(compound_index, std.math.maxInt(u16))); - if (comp.has_tag != 0) { - const tag = comp.tag.slice(selector.source); - const tag_key: u64 = if (comp.tag_key != 0) comp.tag_key else tags.first8Key(tag); + if (comp.hasTag()) { const node_name = node.name_or_text.slice(doc.source); - const node_key = tags.first8Key(node_name); - if (!tags.equalByLenAndKeyIgnoreCase(node_name, node_key, tag, tag_key)) { + if 
(!matcher.tagMatches(selector.source, comp, node_name)) { return .{ .kind = .tag, .group_index = g, .compound_index = c, .predicate_index = predicate_index }; } predicate_index += 1; } - if (comp.has_id != 0) { + if (comp.hasId()) { const id = comp.id.slice(selector.source); const value = attr_inline.getAttrValue(doc, node, "id") orelse return .{ .kind = .id, @@ -160,7 +149,7 @@ fn classifyCompoundFailure( var attr_i: u32 = 0; while (attr_i < comp.attr_len) : (attr_i += 1) { const attr_sel = selector.attrs[comp.attr_start + attr_i]; - if (!matchesAttrSelector(doc, node, selector.source, attr_sel)) { + if (!matcher.matchesAttrSelectorDebug(doc, node, selector.source, attr_sel)) { return .{ .kind = .attr, .group_index = g, .compound_index = c, .predicate_index = predicate_index }; } predicate_index += 1; @@ -169,7 +158,7 @@ fn classifyCompoundFailure( var pseudo_i: u32 = 0; while (pseudo_i < comp.pseudo_len) : (pseudo_i += 1) { const pseudo = selector.pseudos[comp.pseudo_start + pseudo_i]; - if (!matchesPseudo(doc, node_index, pseudo)) { + if (!matcher.matchesPseudo(doc, node_index, pseudo)) { return .{ .kind = .pseudo, .group_index = g, .compound_index = c, .predicate_index = predicate_index }; } predicate_index += 1; @@ -193,54 +182,11 @@ fn matchesNotSimple( selector_source: []const u8, item: ast.NotSimple, ) bool { - return switch (item.kind) { - .tag => tables.eqlIgnoreCaseAscii(node.name_or_text.slice(doc.source), item.text.slice(selector_source)), - .id => blk: { - const id = item.text.slice(selector_source); - const v = attr_inline.getAttrValue(doc, node, "id") orelse break :blk false; - break :blk std.mem.eql(u8, v, id); - }, - .class => blk: { - const class_attr = attr_inline.getAttrValue(doc, node, "class") orelse break :blk false; - break :blk tables.tokenIncludesAsciiWhitespace(class_attr, item.text.slice(selector_source)); - }, - .attr => matchesAttrSelector(doc, node, selector_source, item.attr), - }; -} - -fn matchesAttrSelector( - doc: anytype, - 
node: anytype, - selector_source: []const u8, - sel: ast.AttrSelector, -) bool { - const name = sel.name.slice(selector_source); - const raw = attr_inline.getAttrValue(doc, node, name) orelse return false; - const value = sel.value.slice(selector_source); - - return switch (sel.op) { - .exists => true, - .eq => std.mem.eql(u8, raw, value), - .prefix => std.mem.startsWith(u8, raw, value), - .suffix => std.mem.endsWith(u8, raw, value), - .contains => std.mem.indexOf(u8, raw, value) != null, - .includes => tables.tokenIncludesAsciiWhitespace(raw, value), - .dash_match => std.mem.eql(u8, raw, value) or (raw.len > value.len and std.mem.startsWith(u8, raw, value) and raw[value.len] == '-'), - }; -} - -fn matchesPseudo(doc: anytype, node_index: u32, pseudo: ast.Pseudo) bool { - return switch (pseudo.kind) { - .first_child => prevElementSibling(doc, node_index) == null, - .last_child => nextElementSibling(doc, node_index) == null, - .nth_child => blk: { - _ = parentElement(doc, node_index) orelse break :blk false; - var position: usize = 1; - var prev = doc.nodes.items[node_index].prev_sibling; - while (prev != InvalidIndex) : (prev = doc.nodes.items[prev].prev_sibling) { - position += 1; - } - break :blk pseudo.nth.matches(position); - }, + const Ctx = matcher.NotSimpleCtxDebug(@TypeOf(doc), @TypeOf(node)); + const ctx = Ctx{ + .doc = doc, + .node = node, + .selector_source = selector_source, }; + return matcher.matchesNotSimpleCommon(ctx, item); } diff --git a/src/selector/runtime.zig b/src/selector/runtime.zig index f282835..afffb6b 100644 --- a/src/selector/runtime.zig +++ b/src/selector/runtime.zig @@ -2,6 +2,10 @@ const std = @import("std"); const ast = @import("ast.zig"); const tables = @import("../html/tables.zig"); const tags = @import("../html/tags.zig"); +const test_helpers = @import("test_helpers.zig"); + +// SAFETY: Runtime parser owns `source` bytes via allocator and builds AST +// slices that refer to those owned bytes. /// Runtime selector parser errors. 
pub const Error = error{ @@ -141,11 +145,8 @@ const Parser = struct { const not_items = try self.not_items.toOwnedSlice(self.alloc); errdefer self.alloc.free(not_items); - const requires_parent = selectorRequiresParent(compounds, pseudos); - return .{ .source = self.source, - .requires_parent = requires_parent, .groups = groups, .compounds = compounds, .classes = classes, @@ -170,7 +171,6 @@ const Parser = struct { self.i += 1; consumed = true; } else if (isTagIdentStart(c)) { - out.has_tag = 1; out.tag = self.parseIdent() orelse return error.InvalidSelector; self.lowerRange(out.tag); out.tag_key = tags.first8Key(out.tag.slice(self.source)); @@ -183,8 +183,7 @@ const Parser = struct { switch (c) { '#' => { self.i += 1; - if (out.has_id != 0) return error.InvalidSelector; - out.has_id = 1; + if (!out.id.isEmpty()) return error.InvalidSelector; out.id = self.parseIdent() orelse return error.InvalidSelector; consumed = true; }, @@ -232,31 +231,31 @@ const Parser = struct { if (!self.consumeIf('~')) { if (!self.consumeIf('|')) { if (!self.consumeIf(']')) return error.InvalidSelector; - return .{ .name = name, .name_hash = hashIgnoreCaseAscii(name.slice(self.source)), .op = .exists, .value = .{} }; + return .{ .name = name, .name_hash = tables.hashIgnoreCaseAscii(name.slice(self.source)), .op = .exists, .value = .{} }; } if (!self.consumeIf('=')) return error.InvalidSelector; const v = try self.parseAttrValueThenClose(); - return .{ .name = name, .name_hash = hashIgnoreCaseAscii(name.slice(self.source)), .op = .dash_match, .value = v }; + return .{ .name = name, .name_hash = tables.hashIgnoreCaseAscii(name.slice(self.source)), .op = .dash_match, .value = v }; } if (!self.consumeIf('=')) return error.InvalidSelector; const v = try self.parseAttrValueThenClose(); - return .{ .name = name, .name_hash = hashIgnoreCaseAscii(name.slice(self.source)), .op = .includes, .value = v }; + return .{ .name = name, .name_hash = tables.hashIgnoreCaseAscii(name.slice(self.source)), .op 
= .includes, .value = v }; } if (!self.consumeIf('=')) return error.InvalidSelector; const v = try self.parseAttrValueThenClose(); - return .{ .name = name, .name_hash = hashIgnoreCaseAscii(name.slice(self.source)), .op = .contains, .value = v }; + return .{ .name = name, .name_hash = tables.hashIgnoreCaseAscii(name.slice(self.source)), .op = .contains, .value = v }; } if (!self.consumeIf('=')) return error.InvalidSelector; const v = try self.parseAttrValueThenClose(); - return .{ .name = name, .name_hash = hashIgnoreCaseAscii(name.slice(self.source)), .op = .suffix, .value = v }; + return .{ .name = name, .name_hash = tables.hashIgnoreCaseAscii(name.slice(self.source)), .op = .suffix, .value = v }; } if (!self.consumeIf('=')) return error.InvalidSelector; const v = try self.parseAttrValueThenClose(); - return .{ .name = name, .name_hash = hashIgnoreCaseAscii(name.slice(self.source)), .op = .prefix, .value = v }; + return .{ .name = name, .name_hash = tables.hashIgnoreCaseAscii(name.slice(self.source)), .op = .prefix, .value = v }; } const v = try self.parseAttrValueThenClose(); - return .{ .name = name, .name_hash = hashIgnoreCaseAscii(name.slice(self.source)), .op = .eq, .value = v }; + return .{ .name = name, .name_hash = tables.hashIgnoreCaseAscii(name.slice(self.source)), .op = .eq, .value = v }; } fn parseAttrValueThenClose(noalias self: *Parser) Error!ast.Range { @@ -498,63 +497,18 @@ fn parseSignedInt(bytes: []const u8) ?i32 { return @intCast(value); } -fn hashIgnoreCaseAscii(bytes: []const u8) u32 { - var h: u32 = 2166136261; - for (bytes) |c| { - h = (h ^ @as(u32, tables.lower(c))) *% 16777619; - } - return h; -} - -fn selectorRequiresParent(compounds: []const ast.Compound, pseudos: []const ast.Pseudo) bool { - for (compounds) |comp| { - switch (comp.combinator) { - .child, .descendant => return true, - else => {}, - } - - var i: u32 = 0; - while (i < comp.pseudo_len) : (i += 1) { - const p = pseudos[comp.pseudo_start + i]; - if (p.kind == .nth_child) 
return true; - } - } - return false; -} - test "runtime selector parser covers all attribute operators" { const alloc = std.testing.allocator; var sel = try compileRuntimeImpl(alloc, "div[a][b=v][c^=x][d$=y][e*=z][f~=m][g|=en]"); defer sel.deinit(alloc); - - try std.testing.expectEqual(@as(usize, 1), sel.groups.len); - try std.testing.expectEqual(@as(usize, 1), sel.compounds.len); - - const comp = sel.compounds[0]; - try std.testing.expectEqual(@as(u32, 7), comp.attr_len); - try std.testing.expect(sel.attrs[comp.attr_start + 0].op == .exists); - try std.testing.expect(sel.attrs[comp.attr_start + 1].op == .eq); - try std.testing.expect(sel.attrs[comp.attr_start + 2].op == .prefix); - try std.testing.expect(sel.attrs[comp.attr_start + 3].op == .suffix); - try std.testing.expect(sel.attrs[comp.attr_start + 4].op == .contains); - try std.testing.expect(sel.attrs[comp.attr_start + 5].op == .includes); - try std.testing.expect(sel.attrs[comp.attr_start + 6].op == .dash_match); + try test_helpers.expectAllAttributeOps(sel); } test "runtime selector parser tracks combinator chain and grouping" { const alloc = std.testing.allocator; var sel = try compileRuntimeImpl(alloc, "a b > c + d ~ e, #x"); defer sel.deinit(alloc); - - try std.testing.expectEqual(@as(usize, 2), sel.groups.len); - try std.testing.expectEqual(@as(usize, 6), sel.compounds.len); - - try std.testing.expect(sel.compounds[0].combinator == .none); - try std.testing.expect(sel.compounds[1].combinator == .descendant); - try std.testing.expect(sel.compounds[2].combinator == .child); - try std.testing.expect(sel.compounds[3].combinator == .adjacent); - try std.testing.expect(sel.compounds[4].combinator == .sibling); - try std.testing.expect(sel.compounds[5].combinator == .none); + try test_helpers.expectCombinatorChain(sel); } test "runtime selector parser supports leading combinator and pseudo-only compounds" { @@ -565,7 +519,7 @@ test "runtime selector parser supports leading combinator and pseudo-only compou 
try std.testing.expectEqual(@as(usize, 1), sel.groups.len); try std.testing.expectEqual(@as(usize, 1), sel.compounds.len); try std.testing.expect(sel.compounds[0].combinator == .child); - try std.testing.expect(sel.compounds[0].has_id == 1); + try std.testing.expect(!sel.compounds[0].id.isEmpty()); var sel2 = try compileRuntimeImpl(alloc, "#pseudos :nth-child(odd)"); defer sel2.deinit(alloc); diff --git a/src/selector/test_helpers.zig b/src/selector/test_helpers.zig new file mode 100644 index 0000000..f04aa61 --- /dev/null +++ b/src/selector/test_helpers.zig @@ -0,0 +1,28 @@ +const std = @import("std"); + +pub fn expectAllAttributeOps(sel: anytype) !void { + try std.testing.expectEqual(@as(usize, 1), sel.groups.len); + try std.testing.expectEqual(@as(usize, 1), sel.compounds.len); + + const comp = sel.compounds[0]; + try std.testing.expectEqual(@as(u32, 7), comp.attr_len); + try std.testing.expect(sel.attrs[comp.attr_start + 0].op == .exists); + try std.testing.expect(sel.attrs[comp.attr_start + 1].op == .eq); + try std.testing.expect(sel.attrs[comp.attr_start + 2].op == .prefix); + try std.testing.expect(sel.attrs[comp.attr_start + 3].op == .suffix); + try std.testing.expect(sel.attrs[comp.attr_start + 4].op == .contains); + try std.testing.expect(sel.attrs[comp.attr_start + 5].op == .includes); + try std.testing.expect(sel.attrs[comp.attr_start + 6].op == .dash_match); +} + +pub fn expectCombinatorChain(sel: anytype) !void { + try std.testing.expectEqual(@as(usize, 2), sel.groups.len); + try std.testing.expectEqual(@as(usize, 6), sel.compounds.len); + + try std.testing.expect(sel.compounds[0].combinator == .none); + try std.testing.expect(sel.compounds[1].combinator == .descendant); + try std.testing.expect(sel.compounds[2].combinator == .child); + try std.testing.expect(sel.compounds[3].combinator == .adjacent); + try std.testing.expect(sel.compounds[4].combinator == .sibling); + try std.testing.expect(sel.compounds[5].combinator == .none); +} diff --git 
a/tools/bench/bench.zig b/tools/bench/bench.zig index 40481d6..fd3aa43 100644 --- a/tools/bench/bench.zig +++ b/tools/bench/bench.zig @@ -2,31 +2,20 @@ const std = @import("std"); const root = @import("htmlparser"); const default_options: root.ParseOptions = .{}; const Document = default_options.GetDocument(); +const parse_mode = @import("parse_mode"); +const ParseMode = parse_mode.ParseMode; -const BenchMode = enum { - strictest, - fastest, -}; - -fn parseMode(arg: []const u8) !BenchMode { - if (std.mem.eql(u8, arg, "strictest")) return .strictest; - if (std.mem.eql(u8, arg, "fastest")) return .fastest; - return error.InvalidBenchMode; +fn elapsedNs(start: i96, finish: i96) u64 { + if (finish <= start) return 0; + return @intCast(finish - start); } -fn parseDocForBench(noalias doc: *Document, input: []u8, mode: BenchMode) !void { - switch (mode) { - .strictest => try doc.parse(input, .{ - .drop_whitespace_text_nodes = false, - }), - .fastest => try doc.parse(input, .{ - .drop_whitespace_text_nodes = true, - }), - } +fn nowNs(io: std.Io) i96 { + return std.Io.Timestamp.now(io, .awake).toNanoseconds(); } /// Runs a built-in synthetic parse/query workload and prints elapsed ns. 
-pub fn runSynthetic() !void { +pub fn runSynthetic(io: std.Io) !void { const alloc = std.heap.smp_allocator; var doc = Document.init(alloc); @@ -34,29 +23,29 @@ pub fn runSynthetic() !void { var src = "".*; - const parse_start = std.time.nanoTimestamp(); + const parse_start = nowNs(io); var i: usize = 0; while (i < 10_000) : (i += 1) { try doc.parse(&src, .{}); } - const parse_end = std.time.nanoTimestamp(); + const parse_end = nowNs(io); - const query_start = std.time.nanoTimestamp(); + const query_start = nowNs(io); i = 0; while (i < 100_000) : (i += 1) { _ = doc.queryOne("li.x"); } - const query_end = std.time.nanoTimestamp(); + const query_end = nowNs(io); - std.debug.print("parse ns: {d}\n", .{parse_end - parse_start}); - std.debug.print("query ns: {d}\n", .{query_end - query_start}); + std.debug.print("parse ns: {d}\n", .{elapsedNs(parse_start, parse_end)}); + std.debug.print("query ns: {d}\n", .{elapsedNs(query_start, query_end)}); } /// Benchmarks parse throughput for one fixture and mode; returns total elapsed ns. 
-pub fn runParseFile(path: []const u8, iterations: usize, mode: BenchMode) !u64 { +pub fn runParseFile(io: std.Io, path: []const u8, iterations: usize, mode: ParseMode) !u64 { const alloc = std.heap.smp_allocator; - const input = try std.fs.cwd().readFileAlloc(alloc, path, std.math.maxInt(usize)); + const input = try std.Io.Dir.cwd().readFileAlloc(io, path, alloc, .unlimited); defer alloc.free(input); var working_opt: ?[]u8 = null; @@ -68,7 +57,7 @@ pub fn runParseFile(path: []const u8, iterations: usize, mode: BenchMode) !u64 { var parse_arena = std.heap.ArenaAllocator.init(alloc); defer parse_arena.deinit(); - const start = std.time.nanoTimestamp(); + const start = nowNs(io); var i: usize = 0; while (i < iterations) : (i += 1) { const iter_alloc = parse_arena.allocator(); @@ -77,41 +66,47 @@ pub fn runParseFile(path: []const u8, iterations: usize, mode: BenchMode) !u64 { defer doc.deinit(); if (working_opt) |working| { @memcpy(working, input); - try parseDocForBench(&doc, working, mode); + switch (mode) { + .strictest => try doc.parse(working, .{ .drop_whitespace_text_nodes = false }), + .fastest => try doc.parse(working, .{ .drop_whitespace_text_nodes = true }), + } } else { - try parseDocForBench(&doc, input, mode); + switch (mode) { + .strictest => try doc.parse(input, .{ .drop_whitespace_text_nodes = false }), + .fastest => try doc.parse(input, .{ .drop_whitespace_text_nodes = true }), + } } } _ = parse_arena.reset(.retain_capacity); } - const end = std.time.nanoTimestamp(); + const end = nowNs(io); - return @intCast(end - start); + return elapsedNs(start, end); } /// Benchmarks runtime selector parse cost; returns total elapsed ns. 
-pub fn runQueryParse(selector: []const u8, iterations: usize) !u64 { +pub fn runQueryParse(io: std.Io, selector: []const u8, iterations: usize) !u64 { const alloc = std.heap.smp_allocator; var arena = std.heap.ArenaAllocator.init(alloc); defer arena.deinit(); - const start = std.time.nanoTimestamp(); + const start = nowNs(io); var i: usize = 0; while (i < iterations) : (i += 1) { _ = arena.reset(.retain_capacity); _ = try root.Selector.compileRuntime(arena.allocator(), selector); } - const end = std.time.nanoTimestamp(); + const end = nowNs(io); - return @intCast(end - start); + return elapsedNs(start, end); } /// Benchmarks runtime query execution over a pre-parsed document. -pub fn runQueryMatch(path: []const u8, selector: []const u8, iterations: usize, mode: BenchMode) !u64 { +pub fn runQueryMatch(io: std.Io, path: []const u8, selector: []const u8, iterations: usize, mode: ParseMode) !u64 { const alloc = std.heap.smp_allocator; - const input = try std.fs.cwd().readFileAlloc(alloc, path, std.math.maxInt(usize)); + const input = try std.Io.Dir.cwd().readFileAlloc(io, path, alloc, .unlimited); defer alloc.free(input); const working = try alloc.dupe(u8, input); @@ -119,23 +114,26 @@ pub fn runQueryMatch(path: []const u8, selector: []const u8, iterations: usize, var doc = Document.init(alloc); defer doc.deinit(); - try parseDocForBench(&doc, working, mode); + switch (mode) { + .strictest => try doc.parse(working, .{ .drop_whitespace_text_nodes = false }), + .fastest => try doc.parse(working, .{ .drop_whitespace_text_nodes = true }), + } - const start = std.time.nanoTimestamp(); + const start = nowNs(io); var i: usize = 0; while (i < iterations) : (i += 1) { _ = doc.queryOneRuntime(selector) catch null; } - const end = std.time.nanoTimestamp(); + const end = nowNs(io); - return @intCast(end - start); + return elapsedNs(start, end); } /// Benchmarks cached-selector query execution over a pre-parsed document. 
-pub fn runQueryCached(path: []const u8, selector: []const u8, iterations: usize, mode: BenchMode) !u64 { +pub fn runQueryCached(io: std.Io, path: []const u8, selector: []const u8, iterations: usize, mode: ParseMode) !u64 { const alloc = std.heap.smp_allocator; - const input = try std.fs.cwd().readFileAlloc(alloc, path, std.math.maxInt(usize)); + const input = try std.Io.Dir.cwd().readFileAlloc(io, path, alloc, .unlimited); defer alloc.free(input); const working = try alloc.dupe(u8, input); @@ -148,71 +146,72 @@ pub fn runQueryCached(path: []const u8, selector: []const u8, iterations: usize, var doc = Document.init(alloc); defer doc.deinit(); - try parseDocForBench(&doc, working, mode); + switch (mode) { + .strictest => try doc.parse(working, .{ .drop_whitespace_text_nodes = false }), + .fastest => try doc.parse(working, .{ .drop_whitespace_text_nodes = true }), + } - const start = std.time.nanoTimestamp(); + const start = nowNs(io); var i: usize = 0; while (i < iterations) : (i += 1) { - _ = doc.queryOneCached(&sel); + _ = doc.queryOneCached(sel); } - const end = std.time.nanoTimestamp(); + const end = nowNs(io); - return @intCast(end - start); + return elapsedNs(start, end); } /// CLI entrypoint for parser/query benchmarking utilities. 
-pub fn main() !void { - const alloc = std.heap.smp_allocator; - - const args = try std.process.argsAlloc(alloc); - defer std.process.argsFree(alloc, args); +pub fn main(init: std.process.Init) !void { + const io = init.io; + const args = try init.minimal.args.toSlice(init.arena.allocator()); if (args.len == 1) { - try runSynthetic(); + try runSynthetic(io); return; } if (args.len == 4 and std.mem.eql(u8, args[1], "query-parse")) { const iterations = try std.fmt.parseInt(usize, args[3], 10); - const total_ns = try runQueryParse(args[2], iterations); + const total_ns = try runQueryParse(io, args[2], iterations); std.debug.print("{d}\n", .{total_ns}); return; } if (args.len == 5 and std.mem.eql(u8, args[1], "query-match")) { const iterations = try std.fmt.parseInt(usize, args[4], 10); - const total_ns = try runQueryMatch(args[2], args[3], iterations, .fastest); + const total_ns = try runQueryMatch(io, args[2], args[3], iterations, .fastest); std.debug.print("{d}\n", .{total_ns}); return; } if (args.len == 6 and std.mem.eql(u8, args[1], "query-match")) { - const mode = try parseMode(args[2]); + const mode = parse_mode.parseMode(args[2]) orelse return error.InvalidBenchMode; const iterations = try std.fmt.parseInt(usize, args[5], 10); - const total_ns = try runQueryMatch(args[3], args[4], iterations, mode); + const total_ns = try runQueryMatch(io, args[3], args[4], iterations, mode); std.debug.print("{d}\n", .{total_ns}); return; } if (args.len == 5 and std.mem.eql(u8, args[1], "query-cached")) { const iterations = try std.fmt.parseInt(usize, args[4], 10); - const total_ns = try runQueryCached(args[2], args[3], iterations, .fastest); + const total_ns = try runQueryCached(io, args[2], args[3], iterations, .fastest); std.debug.print("{d}\n", .{total_ns}); return; } if (args.len == 6 and std.mem.eql(u8, args[1], "query-cached")) { - const mode = try parseMode(args[2]); + const mode = parse_mode.parseMode(args[2]) orelse return error.InvalidBenchMode; const iterations = 
try std.fmt.parseInt(usize, args[5], 10); - const total_ns = try runQueryCached(args[3], args[4], iterations, mode); + const total_ns = try runQueryCached(io, args[3], args[4], iterations, mode); std.debug.print("{d}\n", .{total_ns}); return; } if (args.len == 5 and std.mem.eql(u8, args[1], "parse")) { - const mode = try parseMode(args[2]); + const mode = parse_mode.parseMode(args[2]) orelse return error.InvalidBenchMode; const iterations = try std.fmt.parseInt(usize, args[4], 10); - const total_ns = try runParseFile(args[3], iterations, mode); + const total_ns = try runParseFile(io, args[3], iterations, mode); std.debug.print("{d}\n", .{total_ns}); return; } @@ -226,6 +225,22 @@ pub fn main() !void { } const iterations = try std.fmt.parseInt(usize, args[2], 10); - const total_ns = try runParseFile(args[1], iterations, .fastest); + const total_ns = try runParseFile(io, args[1], iterations, .fastest); std.debug.print("{d}\n", .{total_ns}); } + +test "bench smoke uses parse_mode module for both parse modes" { + const alloc = std.testing.allocator; + + var fastest_doc = Document.init(alloc); + defer fastest_doc.deinit(); + var fastest_html = "
- 1
- 2
- 3
ok".*; + try fastest_doc.parse(&fastest_html, .{ .drop_whitespace_text_nodes = true }); + try std.testing.expect(fastest_doc.queryOne("span#x") != null); + + var strict_doc = Document.init(alloc); + defer strict_doc.deinit(); + var strict_html = "\n ok\n".*; + try strict_doc.parse(&strict_html, .{ .drop_whitespace_text_nodes = false }); + try std.testing.expect(strict_doc.queryOne("span#y") != null); +} diff --git a/tools/common.zig b/tools/common.zig index eca4616..0f0fe9a 100644 --- a/tools/common.zig +++ b/tools/common.zig @@ -1,14 +1,18 @@ const std = @import("std"); /// Returns true when `path` exists relative to the current working directory. -pub fn fileExists(path: []const u8) bool { - std.fs.cwd().access(path, .{}) catch return false; +pub fn fileExists(io: std.Io, path: []const u8) bool { + if (std.fs.path.isAbsolute(path)) { + std.Io.Dir.accessAbsolute(io, path, .{}) catch return false; + } else { + std.Io.Dir.cwd().access(io, path, .{}) catch return false; + } return true; } /// Creates `path` and any missing parent directories. -pub fn ensureDir(path: []const u8) !void { - try std.fs.cwd().makePath(path); +pub fn ensureDir(io: std.Io, path: []const u8) !void { + try std.Io.Dir.cwd().createDirPath(io, path); } /// Renders argv-like tokens as a shell-style debug string. @@ -30,36 +34,36 @@ pub fn joinArgs(alloc: std.mem.Allocator, argv: []const []const u8) ![]u8 { } /// Runs a child process inheriting stdio, returning error on non-zero exit. 
-pub fn runInherit(alloc: std.mem.Allocator, argv: []const []const u8, cwd: ?[]const u8) !void { +pub fn runInherit(io: std.Io, alloc: std.mem.Allocator, argv: []const []const u8, cwd: ?[]const u8) !void { const pretty = try joinArgs(alloc, argv); defer alloc.free(pretty); std.debug.print("{s}\n", .{pretty}); - var child = std.process.Child.init(argv, alloc); - child.cwd = cwd; - child.stdin_behavior = .Inherit; - child.stdout_behavior = .Inherit; - child.stderr_behavior = .Inherit; - try child.spawn(); - const term = try child.wait(); + var child = try std.process.spawn(io, .{ + .argv = argv, + .cwd = if (cwd) |path| .{ .path = path } else .inherit, + .stdin = .inherit, + .stdout = .inherit, + .stderr = .inherit, + }); + const term = try child.wait(io); switch (term) { - .Exited => |code| if (code != 0) return error.ChildProcessFailed, + .exited => |code| if (code != 0) return error.ChildProcessFailed, else => return error.ChildProcessFailed, } } /// Runs a child process and returns combined stdout/stderr output. -pub fn runCaptureCombined(alloc: std.mem.Allocator, argv: []const []const u8, cwd: ?[]const u8) ![]u8 { - const res = try std.process.Child.run(.{ - .allocator = alloc, +pub fn runCaptureCombined(io: std.Io, alloc: std.mem.Allocator, argv: []const []const u8, cwd: ?[]const u8) ![]u8 { + const res = try std.process.run(alloc, io, .{ .argv = argv, - .cwd = cwd, + .cwd = if (cwd) |path| .{ .path = path } else .inherit, }); defer alloc.free(res.stdout); defer alloc.free(res.stderr); switch (res.term) { - .Exited => |code| if (code != 0) return error.ChildProcessFailed, + .exited => |code| if (code != 0) return error.ChildProcessFailed, else => return error.ChildProcessFailed, } @@ -74,17 +78,16 @@ pub fn runCaptureCombined(alloc: std.mem.Allocator, argv: []const []const u8, cw } /// Runs a child process and returns trimmed stdout output. 
-pub fn runCaptureStdout(alloc: std.mem.Allocator, argv: []const []const u8, cwd: ?[]const u8) ![]u8 { - const res = try std.process.Child.run(.{ - .allocator = alloc, +pub fn runCaptureStdout(io: std.Io, alloc: std.mem.Allocator, argv: []const []const u8, cwd: ?[]const u8) ![]u8 { + const res = try std.process.run(alloc, io, .{ .argv = argv, - .cwd = cwd, + .cwd = if (cwd) |path| .{ .path = path } else .inherit, }); defer alloc.free(res.stdout); defer alloc.free(res.stderr); switch (res.term) { - .Exited => |code| if (code != 0) return error.ChildProcessFailed, + .exited => |code| if (code != 0) return error.ChildProcessFailed, else => return error.ChildProcessFailed, } return alloc.dupe(u8, std.mem.trim(u8, res.stdout, " \r\n\t")); @@ -113,18 +116,18 @@ pub fn medianU64(alloc: std.mem.Allocator, vals: []const u64) !u64 { } /// Writes `bytes` to `path`, truncating any existing file. -pub fn writeFile(path: []const u8, bytes: []const u8) !void { - const file = try std.fs.cwd().createFile(path, .{ .truncate = true }); - defer file.close(); - try file.writeAll(bytes); +pub fn writeFile(io: std.Io, path: []const u8, bytes: []const u8) !void { + const file = try std.Io.Dir.cwd().createFile(io, path, .{ .truncate = true }); + defer file.close(io); + try file.writeStreamingAll(io, bytes); } /// Reads an entire file into allocator-owned memory. -pub fn readFileAlloc(alloc: std.mem.Allocator, path: []const u8) ![]u8 { - return try std.fs.cwd().readFileAlloc(alloc, path, std.math.maxInt(usize)); +pub fn readFileAlloc(io: std.Io, alloc: std.mem.Allocator, path: []const u8) ![]u8 { + return try std.Io.Dir.cwd().readFileAlloc(io, path, alloc, .unlimited); } /// Returns current UNIX timestamp in seconds. 
-pub fn nowUnix() i64 { - return std.time.timestamp(); +pub fn nowUnix(io: std.Io) i64 { + return std.Io.Timestamp.now(io, .real).toSeconds(); } diff --git a/tools/parse_mode.zig b/tools/parse_mode.zig new file mode 100644 index 0000000..7f08c98 --- /dev/null +++ b/tools/parse_mode.zig @@ -0,0 +1,15 @@ +const std = @import("std"); +const html = @import("htmlparser"); +const default_options: html.ParseOptions = .{}; +const Document = default_options.GetDocument(); + +pub const ParseMode = enum { + strictest, + fastest, +}; + +pub fn parseMode(s: []const u8) ?ParseMode { + if (std.mem.eql(u8, s, "strictest")) return .strictest; + if (std.mem.eql(u8, s, "fastest")) return .fastest; + return null; +} diff --git a/tools/scripts.zig b/tools/scripts.zig index 9ab7047..fd56297 100644 --- a/tools/scripts.zig +++ b/tools/scripts.zig @@ -164,12 +164,12 @@ fn getProfile(name: []const u8) !Profile { return error.InvalidProfile; } -fn pathExists(path: []const u8) bool { - return common.fileExists(path); +fn pathExists(io: std.Io, path: []const u8) bool { + return common.fileExists(io, path); } -fn setupParsers(alloc: std.mem.Allocator) !void { - try common.ensureDir(PARSERS_DIR); +fn setupParsers(io: std.Io, alloc: std.mem.Allocator) !void { + try common.ensureDir(io, PARSERS_DIR); const repos = [_]struct { url: []const u8, dir: []const u8 }{ .{ .url = "https://github.com/lexbor/lexbor.git", .dir = "lexbor" }, .{ .url = "https://github.com/cloudflare/lol-html.git", .dir = "lol-html" }, @@ -177,7 +177,7 @@ fn setupParsers(alloc: std.mem.Allocator) !void { for (repos) |repo| { const git_path = try std.fmt.allocPrint(alloc, "{s}/{s}/.git", .{ PARSERS_DIR, repo.dir }); defer alloc.free(git_path); - if (pathExists(git_path)) { + if (pathExists(io, git_path)) { std.debug.print("already present: {s}\n", .{repo.dir}); continue; } @@ -185,13 +185,13 @@ fn setupParsers(alloc: std.mem.Allocator) !void { const dst = try std.fmt.allocPrint(alloc, "{s}/{s}", .{ PARSERS_DIR, repo.dir }); defer 
alloc.free(dst); const argv = [_][]const u8{ "git", "clone", "--depth", "1", repo.url, dst }; - try common.runInherit(alloc, &argv, REPO_ROOT); + try common.runInherit(io, alloc, &argv, REPO_ROOT); } std.debug.print("done\n", .{}); } -fn setupFixtures(alloc: std.mem.Allocator, refresh: bool) !void { - try common.ensureDir(FIXTURES_DIR); +fn setupFixtures(io: std.Io, alloc: std.mem.Allocator, refresh: bool) !void { + try common.ensureDir(io, FIXTURES_DIR); const targets = [_]struct { url: []const u8, out: []const u8 }{ .{ .url = "https://www.rust-lang.org/", .out = "rust-lang.html" }, .{ .url = "https://en.wikipedia.org/wiki/HTML", .out = "wiki-html.html" }, @@ -211,7 +211,7 @@ fn setupFixtures(alloc: std.mem.Allocator, refresh: bool) !void { defer alloc.free(target); if (!refresh) { - const stat = std.fs.cwd().statFile(target) catch null; + const stat = std.Io.Dir.cwd().statFile(io, target, .{}) catch null; if (stat != null and stat.?.size > 0) { std.debug.print("cached: {s}\n", .{item.out}); continue; @@ -234,17 +234,17 @@ fn setupFixtures(alloc: std.mem.Allocator, refresh: bool) !void { "-o", target, }; - try common.runInherit(alloc, &argv, REPO_ROOT); + try common.runInherit(io, alloc, &argv, REPO_ROOT); } std.debug.print("fixtures ready in {s}\n", .{FIXTURES_DIR}); } -fn ensureExternalParsersBuilt(alloc: std.mem.Allocator) !void { - if (!pathExists("bench/parsers/lol-html/Cargo.toml")) { - try setupParsers(alloc); +fn ensureExternalParsersBuilt(io: std.Io, alloc: std.mem.Allocator) !void { + if (!pathExists(io, "bench/parsers/lol-html/Cargo.toml")) { + try setupParsers(io, alloc); } - if (!pathExists("bench/build/lexbor/liblexbor_static.a")) { + if (!pathExists(io, "bench/build/lexbor/liblexbor_static.a")) { const cmake_cfg = [_][]const u8{ "cmake", "-S", @@ -255,16 +255,16 @@ fn ensureExternalParsersBuilt(alloc: std.mem.Allocator) !void { "-DLEXBOR_BUILD_TESTS=OFF", "-DLEXBOR_BUILD_EXAMPLES=OFF", }; - try common.runInherit(alloc, &cmake_cfg, REPO_ROOT); + try 
common.runInherit(io, alloc, &cmake_cfg, REPO_ROOT); const cmake_build = [_][]const u8{ "cmake", "--build", "bench/build/lexbor", "-j" }; - try common.runInherit(alloc, &cmake_build, REPO_ROOT); + try common.runInherit(io, alloc, &cmake_build, REPO_ROOT); } } -fn buildRunners(alloc: std.mem.Allocator) !void { - try common.ensureDir(BIN_DIR); +fn buildRunners(io: std.Io, alloc: std.mem.Allocator) !void { + try common.ensureDir(io, BIN_DIR); const zig_build = [_][]const u8{ "zig", "build", "-Doptimize=ReleaseFast" }; - try common.runInherit(alloc, &zig_build, REPO_ROOT); + try common.runInherit(io, alloc, &zig_build, REPO_ROOT); const strlen_cc = [_][]const u8{ "cc", @@ -274,7 +274,7 @@ fn buildRunners(alloc: std.mem.Allocator) !void { "-o", "bench/build/bin/strlen_runner", }; - try common.runInherit(alloc, &strlen_cc, REPO_ROOT); + try common.runInherit(io, alloc, &strlen_cc, REPO_ROOT); const lexbor_cc = [_][]const u8{ "cc", @@ -286,7 +286,7 @@ fn buildRunners(alloc: std.mem.Allocator) !void { "-o", "bench/build/bin/lexbor_runner", }; - try common.runInherit(alloc, &lexbor_cc, REPO_ROOT); + try common.runInherit(io, alloc, &lexbor_cc, REPO_ROOT); const cargo_lol = [_][]const u8{ "cargo", @@ -295,7 +295,7 @@ fn buildRunners(alloc: std.mem.Allocator) !void { "--manifest-path", "bench/runners/lol_html_runner/Cargo.toml", }; - try common.runInherit(alloc, &cargo_lol, REPO_ROOT); + try common.runInherit(io, alloc, &cargo_lol, REPO_ROOT); } const ParseResult = struct { @@ -309,7 +309,6 @@ const ParseResult = struct { const QueryResult = struct { parser: []const u8, - mode: []const u8, case: []const u8, selector: []const u8, fixture: ?[]const u8 = null, @@ -351,7 +350,6 @@ const ReadmeBenchSnapshot = struct { const ExternalSuiteCounts = struct { total: usize, passed: usize, - failed: usize, }; const ExternalSuiteMode = struct { @@ -414,10 +412,43 @@ fn freeArgv(alloc: std.mem.Allocator, argv: []const []const u8) void { alloc.free(argv); } -fn runIntCmd(alloc: 
std.mem.Allocator, argv: []const []const u8) !u64 { +fn freeParseSamples(alloc: std.mem.Allocator, results: []const ParseResult) void { + for (results) |row| { + alloc.free(row.samples_ns); + } +} + +fn freeQuerySamples(alloc: std.mem.Allocator, results: []const QueryResult) void { + for (results) |row| { + alloc.free(row.samples_ns); + } +} + +fn deinitOwnedStringList(alloc: std.mem.Allocator, list: *std.ArrayList([]const u8)) void { + for (list.items) |item| alloc.free(item); + list.deinit(alloc); +} + +fn deinitOwnedStringSet(alloc: std.mem.Allocator, set: *std.StringHashMap(void)) void { + var it = set.keyIterator(); + while (it.next()) |key_ptr| alloc.free(key_ptr.*); + set.deinit(); +} + +fn appendOwnedString(alloc: std.mem.Allocator, list: *std.ArrayList([]const u8), item: []const u8) !void { + errdefer alloc.free(item); + try list.append(alloc, item); +} + +fn putOwnedString(alloc: std.mem.Allocator, set: *std.StringHashMap(void), key: []const u8) !void { + errdefer alloc.free(key); + try set.put(key, {}); +} + +fn runIntCmd(io: std.Io, alloc: std.mem.Allocator, argv: []const []const u8) !u64 { const taskset_path: ?[]const u8 = blk: { - if (common.fileExists("/usr/bin/taskset")) break :blk "/usr/bin/taskset"; - if (common.fileExists("/bin/taskset")) break :blk "/bin/taskset"; + if (common.fileExists(io, "/usr/bin/taskset")) break :blk "/usr/bin/taskset"; + if (common.fileExists(io, "/bin/taskset")) break :blk "/bin/taskset"; break :blk null; }; @@ -431,28 +462,28 @@ fn runIntCmd(alloc: std.mem.Allocator, argv: []const []const u8) !u64 { } else argv; defer if (run_argv.ptr != argv.ptr) alloc.free(run_argv); - const out = try common.runCaptureCombined(alloc, run_argv, REPO_ROOT); + const out = try common.runCaptureCombined(io, alloc, run_argv, REPO_ROOT); defer alloc.free(out); return common.parseLastInt(out); } -fn benchParseOne(alloc: std.mem.Allocator, parser_name: []const u8, fixture_name: []const u8, iterations: usize) !ParseResult { +fn 
benchParseOne(io: std.Io, alloc: std.mem.Allocator, parser_name: []const u8, fixture_name: []const u8, iterations: usize) !ParseResult { const fixture = try std.fmt.allocPrint(alloc, "{s}/{s}", .{ FIXTURES_DIR, fixture_name }); defer alloc.free(fixture); - const stat = try std.fs.cwd().statFile(fixture); + const stat = try std.Io.Dir.cwd().statFile(io, fixture, .{}); const size_bytes = stat.size; { const warm = try runnerCmdParse(alloc, parser_name, fixture, 1); defer freeArgv(alloc, warm); - _ = try runIntCmd(alloc, warm); + _ = try runIntCmd(io, alloc, warm); } const samples = try alloc.alloc(u64, repeats); for (samples) |*slot| { const argv = try runnerCmdParse(alloc, parser_name, fixture, iterations); defer freeArgv(alloc, argv); - slot.* = try runIntCmd(alloc, argv); + slot.* = try runIntCmd(io, alloc, argv); } const median_ns = try common.medianU64(alloc, samples); @@ -469,19 +500,19 @@ fn benchParseOne(alloc: std.mem.Allocator, parser_name: []const u8, fixture_name }; } -fn benchQueryParseOne(alloc: std.mem.Allocator, parser_name: []const u8, case_name: []const u8, selector: []const u8, iterations: usize) !QueryResult { +fn benchQueryParseOne(io: std.Io, alloc: std.mem.Allocator, parser_name: []const u8, case_name: []const u8, selector: []const u8, iterations: usize) !QueryResult { const iter_s = try std.fmt.allocPrint(alloc, "{d}", .{iterations}); defer alloc.free(iter_s); { const warm = [_][]const u8{ "zig-out/bin/htmlparser-bench", "query-parse", selector, "1" }; - _ = try runIntCmd(alloc, &warm); + _ = try runIntCmd(io, alloc, &warm); } const samples = try alloc.alloc(u64, repeats); for (samples) |*slot| { const argv = [_][]const u8{ "zig-out/bin/htmlparser-bench", "query-parse", selector, iter_s }; - slot.* = try runIntCmd(alloc, &argv); + slot.* = try runIntCmd(io, alloc, &argv); } const median_ns = try common.medianU64(alloc, samples); @@ -490,7 +521,6 @@ fn benchQueryParseOne(alloc: std.mem.Allocator, parser_name: []const u8, case_na const ns_per_op 
= @as(f64, @floatFromInt(median_ns)) / @as(f64, @floatFromInt(iterations)); return .{ .parser = parser_name, - .mode = "runtime", .case = case_name, .selector = selector, .iterations = iterations, @@ -501,7 +531,7 @@ fn benchQueryParseOne(alloc: std.mem.Allocator, parser_name: []const u8, case_na }; } -fn benchQueryExecOne(alloc: std.mem.Allocator, parser_name: []const u8, mode: []const u8, case_name: []const u8, fixture_name: []const u8, selector: []const u8, iterations: usize, cached: bool) !QueryResult { +fn benchQueryExecOne(io: std.Io, alloc: std.mem.Allocator, parser_name: []const u8, mode: []const u8, case_name: []const u8, fixture_name: []const u8, selector: []const u8, iterations: usize, cached: bool) !QueryResult { const fixture = try std.fmt.allocPrint(alloc, "{s}/{s}", .{ FIXTURES_DIR, fixture_name }); defer alloc.free(fixture); const iter_s = try std.fmt.allocPrint(alloc, "{d}", .{iterations}); @@ -510,13 +540,13 @@ fn benchQueryExecOne(alloc: std.mem.Allocator, parser_name: []const u8, mode: [] { const warm = [_][]const u8{ "zig-out/bin/htmlparser-bench", sub, mode, fixture, selector, "1" }; - _ = try runIntCmd(alloc, &warm); + _ = try runIntCmd(io, alloc, &warm); } const samples = try alloc.alloc(u64, repeats); for (samples) |*slot| { const argv = [_][]const u8{ "zig-out/bin/htmlparser-bench", sub, mode, fixture, selector, iter_s }; - slot.* = try runIntCmd(alloc, &argv); + slot.* = try runIntCmd(io, alloc, &argv); } const median_ns = try common.medianU64(alloc, samples); const seconds = @as(f64, @floatFromInt(median_ns)) / 1_000_000_000.0; @@ -524,7 +554,6 @@ fn benchQueryExecOne(alloc: std.mem.Allocator, parser_name: []const u8, mode: [] const ns_per_op = @as(f64, @floatFromInt(median_ns)) / @as(f64, @floatFromInt(iterations)); return .{ .parser = parser_name, - .mode = mode, .case = case_name, .selector = selector, .fixture = fixture_name, @@ -568,42 +597,48 @@ fn findReadmeQuery(rows: []const ReadmeQueryResult, parser_name: []const u8, cas return 
null; } -fn appendUniqueString(list: *std.ArrayList([]const u8), alloc: std.mem.Allocator, value: []const u8) !void { - for (list.items) |it| { - if (std.mem.eql(u8, it, value)) return; - } - try list.append(alloc, value); -} - -fn writeMaybeF64(w: anytype, value: ?f64) !void { - if (value) |v| { - try w.print("{d:.2}", .{v}); - } else { - try w.writeAll("-"); - } -} - fn renderDocumentationBenchmarkSection(alloc: std.mem.Allocator, snap: ReadmeBenchSnapshot) ![]u8 { - var out = std.ArrayList(u8).empty; - errdefer out.deinit(alloc); - const w = out.writer(alloc); + var out: std.Io.Writer.Allocating = .init(alloc); + errdefer out.deinit(); + const w = &out.writer; var fixtures = std.ArrayList([]const u8).empty; defer fixtures.deinit(alloc); for (snap.parse_results) |row| { - try appendUniqueString(&fixtures, alloc, row.fixture); + var seen = false; + for (fixtures.items) |it| { + if (std.mem.eql(u8, it, row.fixture)) { + seen = true; + break; + } + } + if (!seen) try fixtures.append(alloc, row.fixture); } var query_match_cases = std.ArrayList([]const u8).empty; defer query_match_cases.deinit(alloc); for (snap.query_match_results) |row| { - try appendUniqueString(&query_match_cases, alloc, row.case); + var seen = false; + for (query_match_cases.items) |it| { + if (std.mem.eql(u8, it, row.case)) { + seen = true; + break; + } + } + if (!seen) try query_match_cases.append(alloc, row.case); } var query_parse_cases = std.ArrayList([]const u8).empty; defer query_parse_cases.deinit(alloc); for (snap.query_parse_results) |row| { - try appendUniqueString(&query_parse_cases, alloc, row.case); + var seen = false; + for (query_parse_cases.items) |it| { + if (std.mem.eql(u8, it, row.case)) { + seen = true; + break; + } + } + if (!seen) try query_parse_cases.append(alloc, row.case); } try w.print("Source: `bench/results/latest.json` (`{s}` profile).\n\n", .{snap.profile}); @@ -613,11 +648,23 @@ fn renderDocumentationBenchmarkSection(alloc: std.mem.Allocator, snap: ReadmeBen try 
w.writeAll("|---|---:|---:|---:|\n"); for (fixtures.items) |fixture| { try w.print("| `{s}` | ", .{fixture}); - try writeMaybeF64(w, findReadmeParseThroughput(snap.parse_results, "ours", fixture)); + if (findReadmeParseThroughput(snap.parse_results, "ours", fixture)) |v| { + try w.print("{d:.2}", .{v}); + } else { + try w.writeAll("-"); + } try w.writeAll(" | "); - try writeMaybeF64(w, findReadmeParseThroughput(snap.parse_results, "lol-html", fixture)); + if (findReadmeParseThroughput(snap.parse_results, "lol-html", fixture)) |v| { + try w.print("{d:.2}", .{v}); + } else { + try w.writeAll("-"); + } try w.writeAll(" | "); - try writeMaybeF64(w, findReadmeParseThroughput(snap.parse_results, "lexbor", fixture)); + if (findReadmeParseThroughput(snap.parse_results, "lexbor", fixture)) |v| { + try w.print("{d:.2}", .{v}); + } else { + try w.writeAll("-"); + } try w.writeAll(" |\n"); } @@ -627,9 +674,17 @@ fn renderDocumentationBenchmarkSection(alloc: std.mem.Allocator, snap: ReadmeBen for (query_match_cases.items) |case_name| { const ours = findReadmeQuery(snap.query_match_results, "ours", case_name); try w.print("| `{s}` | ", .{case_name}); - try writeMaybeF64(w, if (ours) |s| s.ops_s else null); + if (ours) |s| { + try w.print("{d:.2}", .{s.ops_s}); + } else { + try w.writeAll("-"); + } try w.writeAll(" | "); - try writeMaybeF64(w, if (ours) |s| s.ns_per_op else null); + if (ours) |s| { + try w.print("{d:.2}", .{s.ns_per_op}); + } else { + try w.writeAll("-"); + } try w.writeAll(" |\n"); } @@ -639,9 +694,17 @@ fn renderDocumentationBenchmarkSection(alloc: std.mem.Allocator, snap: ReadmeBen for (query_match_cases.items) |case_name| { const ours = findReadmeQuery(snap.query_cached_results, "ours", case_name); try w.print("| `{s}` | ", .{case_name}); - try writeMaybeF64(w, if (ours) |s| s.ops_s else null); + if (ours) |s| { + try w.print("{d:.2}", .{s.ops_s}); + } else { + try w.writeAll("-"); + } try w.writeAll(" | "); - try writeMaybeF64(w, if (ours) |s| s.ns_per_op 
else null); + if (ours) |s| { + try w.print("{d:.2}", .{s.ns_per_op}); + } else { + try w.writeAll("-"); + } try w.writeAll(" |\n"); } @@ -651,9 +714,17 @@ fn renderDocumentationBenchmarkSection(alloc: std.mem.Allocator, snap: ReadmeBen for (query_parse_cases.items) |case_name| { const ours = findReadmeQuery(snap.query_parse_results, "ours", case_name); try w.print("| `{s}` | ", .{case_name}); - try writeMaybeF64(w, if (ours) |r| r.ops_s else null); + if (ours) |r| { + try w.print("{d:.2}", .{r.ops_s}); + } else { + try w.writeAll("-"); + } try w.writeAll(" | "); - try writeMaybeF64(w, if (ours) |r| r.ns_per_op else null); + if (ours) |r| { + try w.print("{d:.2}", .{r.ns_per_op}); + } else { + try w.writeAll("-"); + } try w.writeAll(" |\n"); } @@ -661,11 +732,11 @@ fn renderDocumentationBenchmarkSection(alloc: std.mem.Allocator, snap: ReadmeBen try w.writeAll("- `bench/results/latest.md`\n"); try w.writeAll("- `bench/results/latest.json`\n"); - return out.toOwnedSlice(alloc); + return out.toOwnedSlice(); } -fn updateDocumentationBenchmarkSnapshot(alloc: std.mem.Allocator) !void { - const latest_json = try common.readFileAlloc(alloc, "bench/results/latest.json"); +fn updateDocumentationBenchmarkSnapshot(io: std.Io, alloc: std.mem.Allocator) !void { + const latest_json = try common.readFileAlloc(io, alloc, "bench/results/latest.json"); defer alloc.free(latest_json); const parsed = try std.json.parseFromSlice(ReadmeBenchSnapshot, alloc, latest_json, .{ @@ -676,7 +747,7 @@ fn updateDocumentationBenchmarkSnapshot(alloc: std.mem.Allocator) !void { const replacement = try renderDocumentationBenchmarkSection(alloc, parsed.value); defer alloc.free(replacement); - const documentation = try common.readFileAlloc(alloc, "DOCUMENTATION.md"); + const documentation = try common.readFileAlloc(io, alloc, "DOCUMENTATION.md"); defer alloc.free(documentation); const start = std.mem.indexOf(u8, documentation, DocumentationBenchmarkStartMarker) orelse return 
error.ReadmeBenchMarkersMissing; @@ -697,7 +768,7 @@ fn updateDocumentationBenchmarkSnapshot(alloc: std.mem.Allocator) !void { try out.appendSlice(alloc, documentation[end..]); if (!std.mem.eql(u8, out.items, documentation)) { - try common.writeFile("DOCUMENTATION.md", out.items); + try common.writeFile(io, "DOCUMENTATION.md", out.items); std.debug.print("wrote DOCUMENTATION.md benchmark snapshot\n", .{}); } else { std.debug.print("DOCUMENTATION.md benchmark snapshot already up-to-date\n", .{}); @@ -713,14 +784,6 @@ fn cmpParseAverageDesc(_: void, a: ParseAverageRow, b: ParseAverageRow) bool { return a.avg_mb_s > b.avg_mb_s; } -fn writeSpaces(w: anytype, count: usize) !void { - for (0..count) |_| try w.writeAll(" "); -} - -fn writeRepeatGlyph(w: anytype, glyph: []const u8, count: usize) !void { - for (0..count) |_| try w.writeAll(glyph); -} - fn parseAverageRows(alloc: std.mem.Allocator, snap: ReadmeBenchSnapshot) ![]ParseAverageRow { const parser_names = [_][]const u8{ "ours", "lol-html", "lexbor" }; var rows = std.ArrayList(ParseAverageRow).empty; @@ -745,6 +808,10 @@ fn parseAverageRows(alloc: std.mem.Allocator, snap: ReadmeBenchSnapshot) ![]Pars return rows.toOwnedSlice(alloc); } +fn failedCount(summary: anytype) usize { + return summary.total - summary.passed; +} + fn writeConformanceRow( w: anytype, profile: []const u8, @@ -757,27 +824,27 @@ fn writeConformanceRow( profile, nw.passed, nw.total, - nw.failed, + failedCount(nw), qw.passed, qw.total, - qw.failed, + failedCount(qw), html5lib.passed, html5lib.total, - html5lib.failed, + failedCount(html5lib), whatwg.passed, whatwg.total, - whatwg.failed, + failedCount(whatwg), }); } fn parserHtml5libCounts(mode: ExternalSuiteMode) ExternalSuiteCounts { if (mode.parser_suites) |s| return s.html5lib_subset; - return .{ .total = 0, .passed = 0, .failed = 0 }; + return .{ .total = 0, .passed = 0 }; } fn parserWhatwgCounts(mode: ExternalSuiteMode) ExternalSuiteCounts { if (mode.parser_suites) |s| return 
s.whatwg_html_parsing; - return .{ .total = 0, .passed = 0, .failed = 0 }; + return .{ .total = 0, .passed = 0 }; } fn sameExternalMode(a: ExternalSuiteMode, b: ExternalSuiteMode) bool { @@ -788,26 +855,22 @@ fn sameExternalMode(a: ExternalSuiteMode, b: ExternalSuiteMode) bool { return a.selector_suites.nwmatcher.total == b.selector_suites.nwmatcher.total and a.selector_suites.nwmatcher.passed == b.selector_suites.nwmatcher.passed and - a.selector_suites.nwmatcher.failed == b.selector_suites.nwmatcher.failed and a.selector_suites.qwery_contextual.total == b.selector_suites.qwery_contextual.total and a.selector_suites.qwery_contextual.passed == b.selector_suites.qwery_contextual.passed and - a.selector_suites.qwery_contextual.failed == b.selector_suites.qwery_contextual.failed and a_html5.total == b_html5.total and a_html5.passed == b_html5.passed and - a_html5.failed == b_html5.failed and a_whatwg.total == b_whatwg.total and - a_whatwg.passed == b_whatwg.passed and - a_whatwg.failed == b_whatwg.failed; + a_whatwg.passed == b_whatwg.passed; } -fn renderReadmeAutoSummary(alloc: std.mem.Allocator) ![]u8 { - var out = std.ArrayList(u8).empty; - errdefer out.deinit(alloc); - const w = out.writer(alloc); +fn renderReadmeAutoSummary(io: std.Io, alloc: std.mem.Allocator) ![]u8 { + var out: std.Io.Writer.Allocating = .init(alloc); + errdefer out.deinit(); + const w = &out.writer; - const latest_exists = common.fileExists("bench/results/latest.json"); + const latest_exists = common.fileExists(io, "bench/results/latest.json"); if (latest_exists) { - const latest_json = try common.readFileAlloc(alloc, "bench/results/latest.json"); + const latest_json = try common.readFileAlloc(io, alloc, "bench/results/latest.json"); defer alloc.free(latest_json); const parsed = try std.json.parseFromSlice(ReadmeBenchSnapshot, alloc, latest_json, .{ .ignore_unknown_fields = true, @@ -836,10 +899,16 @@ fn renderReadmeAutoSummary(alloc: std.mem.Allocator) ![]u8 { else @as(usize, 0); try 
w.writeAll(r.parser); - try writeSpaces(w, max_name_len - r.parser.len); + for (0..max_name_len - r.parser.len) |_| { + try w.writeByte(' '); + } try w.writeAll(" │"); - try writeRepeatGlyph(w, "█", filled); - try writeRepeatGlyph(w, "░", width - filled); + for (0..filled) |_| { + try w.writeAll("█"); + } + for (0..(width - filled)) |_| { + try w.writeAll("░"); + } try w.print("│ {d:.2} MB/s ({d:.2}%)\n", .{ r.avg_mb_s, pct }); } try w.writeAll("```\n"); @@ -848,8 +917,8 @@ fn renderReadmeAutoSummary(alloc: std.mem.Allocator) ![]u8 { } try w.writeAll("\n### Conformance Snapshot\n\n"); - if (common.fileExists("bench/results/external_suite_report.json")) { - const ext_json = try common.readFileAlloc(alloc, "bench/results/external_suite_report.json"); + if (common.fileExists(io, "bench/results/external_suite_report.json")) { + const ext_json = try common.readFileAlloc(io, alloc, "bench/results/external_suite_report.json"); defer alloc.free(ext_json); const parsed_ext = try std.json.parseFromSlice(ExternalSuiteReport, alloc, ext_json, .{ .ignore_unknown_fields = true, @@ -896,14 +965,14 @@ fn renderReadmeAutoSummary(alloc: std.mem.Allocator) ![]u8 { try w.writeAll("Run `zig build conformance` to generate conformance summary.\n"); } - return out.toOwnedSlice(alloc); + return out.toOwnedSlice(); } -fn updateReadmeAutoSummary(alloc: std.mem.Allocator) !void { - const replacement = try renderReadmeAutoSummary(alloc); +fn updateReadmeAutoSummary(io: std.Io, alloc: std.mem.Allocator) !void { + const replacement = try renderReadmeAutoSummary(io, alloc); defer alloc.free(replacement); - const readme = try common.readFileAlloc(alloc, "README.md"); + const readme = try common.readFileAlloc(io, alloc, "README.md"); defer alloc.free(readme); const start = std.mem.indexOf(u8, readme, ReadmeSummaryStartMarker) orelse return error.ReadmeBenchMarkersMissing; @@ -924,7 +993,7 @@ fn updateReadmeAutoSummary(alloc: std.mem.Allocator) !void { try out.appendSlice(alloc, readme[end..]); if 
(!std.mem.eql(u8, out.items, readme)) { - try common.writeFile("README.md", out.items); + try common.writeFile(io, "README.md", out.items); std.debug.print("wrote README.md auto summary\n", .{}); } else { std.debug.print("README.md auto summary already up-to-date\n", .{}); @@ -932,6 +1001,7 @@ fn updateReadmeAutoSummary(alloc: std.mem.Allocator) !void { } fn writeMarkdown( + io: std.Io, alloc: std.mem.Allocator, profile_name: []const u8, parse_results: []const ParseResult, @@ -940,11 +1010,11 @@ fn writeMarkdown( query_cached_results: []const QueryResult, gate_rows: []const GateRow, ) ![]u8 { - var out = std.ArrayList(u8).empty; - errdefer out.deinit(alloc); - const w = out.writer(alloc); + var out: std.Io.Writer.Allocating = .init(alloc); + errdefer out.deinit(); + const w = &out.writer; - try w.print("# HTML Parser Benchmark Results\n\nGenerated (unix): {d}\n\nProfile: `{s}`\n\n", .{ common.nowUnix(), profile_name }); + try w.print("# HTML Parser Benchmark Results\n\nGenerated (unix): {d}\n\nProfile: `{s}`\n\n", .{ common.nowUnix(io), profile_name }); try w.writeAll("## Parse Throughput\n\n"); var seen = std.StringHashMap(void).init(alloc); @@ -993,9 +1063,9 @@ fn writeMarkdown( try w.writeAll("\n"); } - try writeQuerySection(alloc, &out, "## Query Parse Throughput", query_parse_results); - try writeQuerySection(alloc, &out, "## Query Match Throughput", query_match_results); - try writeQuerySection(alloc, &out, "## Query Cached Throughput", query_cached_results); + try writeQuerySection(alloc, w, "## Query Parse Throughput", query_parse_results); + try writeQuerySection(alloc, w, "## Query Match Throughput", query_match_results); + try writeQuerySection(alloc, w, "## Query Cached Throughput", query_cached_results); if (gate_rows.len > 0) { try w.writeAll("## Ours vs lol-html Gate\n\n"); @@ -1012,11 +1082,10 @@ fn writeMarkdown( try w.writeAll("\n"); } - return out.toOwnedSlice(alloc); + return out.toOwnedSlice(); } -fn writeQuerySection(alloc: std.mem.Allocator, 
out: *std.ArrayList(u8), title: []const u8, rows: []const QueryResult) !void { - const w = out.writer(alloc); +fn writeQuerySection(alloc: std.mem.Allocator, w: *std.Io.Writer, title: []const u8, rows: []const QueryResult) !void { try w.print("{s}\n\n", .{title}); var seen = std.StringHashMap(void).init(alloc); defer seen.deinit(); @@ -1077,7 +1146,7 @@ fn fixtureIterations(profile: Profile, fixture: []const u8) usize { return 0; } -fn rerunFailedGateRows(alloc: std.mem.Allocator, profile: Profile, gate_rows: []GateRow) !void { +fn rerunFailedGateRows(io: std.Io, alloc: std.mem.Allocator, profile: Profile, gate_rows: []GateRow) !void { if (!std.mem.eql(u8, profile.name, "stable")) return; for (gate_rows) |*row| { @@ -1089,9 +1158,9 @@ fn rerunFailedGateRows(alloc: std.mem.Allocator, profile: Profile, gate_rows: [] std.debug.print("re-running flaky gate fixture {s} at {d} iters\n", .{ row.fixture, iters }); - const ours = try benchParseOne(alloc, "ours", row.fixture, iters); + const ours = try benchParseOne(io, alloc, "ours", row.fixture, iters); defer alloc.free(ours.samples_ns); - const lol = try benchParseOne(alloc, "lol-html", row.fixture, iters); + const lol = try benchParseOne(io, alloc, "lol-html", row.fixture, iters); defer alloc.free(lol.samples_ns); row.ours_mb_s = ours.throughput_mb_s; @@ -1101,6 +1170,7 @@ fn rerunFailedGateRows(alloc: std.mem.Allocator, profile: Profile, gate_rows: [] } fn renderConsole( + io: std.Io, alloc: std.mem.Allocator, profile_name: []const u8, parse_results: []const ParseResult, @@ -1109,12 +1179,12 @@ fn renderConsole( query_cached_results: []const QueryResult, gate_rows: []const GateRow, ) ![]u8 { - var out = std.ArrayList(u8).empty; - errdefer out.deinit(alloc); - const w = out.writer(alloc); + var out: std.Io.Writer.Allocating = .init(alloc); + errdefer out.deinit(); + const w = &out.writer; try w.writeAll("HTML Parser Benchmark Results\n"); - try w.print("Generated (unix): {d}\n", .{common.nowUnix()}); + try 
w.print("Generated (unix): {d}\n", .{common.nowUnix(io)}); try w.print("Profile: {s}\n\n", .{profile_name}); try w.writeAll("Parse Throughput\n\n"); @@ -1184,9 +1254,9 @@ fn renderConsole( try w.writeAll("\n"); } - try renderQueryConsoleSection(alloc, &out, "Query Parse Throughput", query_parse_results); - try renderQueryConsoleSection(alloc, &out, "Query Match Throughput", query_match_results); - try renderQueryConsoleSection(alloc, &out, "Query Cached Throughput", query_cached_results); + try renderQueryConsoleSection(alloc, w, "Query Parse Throughput", query_parse_results); + try renderQueryConsoleSection(alloc, w, "Query Match Throughput", query_match_results); + try renderQueryConsoleSection(alloc, w, "Query Cached Throughput", query_cached_results); if (gate_rows.len > 0) { try w.writeAll("Ours vs lol-html Gate\n\n"); @@ -1217,13 +1287,19 @@ fn renderConsole( try w.writeAll("\n"); } - return out.toOwnedSlice(alloc); + return out.toOwnedSlice(); +} + +fn writeByteNTimes(writer: anytype, byte: u8, n: usize) !void { + for (0..n) |_| { + try writer.writeByte(byte); + } } fn appendAsciiSep(writer: anytype, widths: []const usize) !void { try writer.writeAll("+-"); for (widths, 0..) 
|w, i| { - try writer.writeByteNTimes('-', w); + try writeByteNTimes(writer, '-', w); if (i + 1 == widths.len) { try writer.writeAll("-+\n"); } else { @@ -1238,11 +1314,11 @@ fn appendAsciiRow(writer: anytype, widths: []const usize, cells: []const []const const width = widths[i]; const pad = if (width > cell.len) width - cell.len else 0; if (right_align[i]) { - try writer.writeByteNTimes(' ', pad); + try writeByteNTimes(writer, ' ', pad); try writer.writeAll(cell); } else { try writer.writeAll(cell); - try writer.writeByteNTimes(' ', pad); + try writeByteNTimes(writer, ' ', pad); } if (i + 1 == cells.len) { try writer.writeAll(" |\n"); @@ -1252,8 +1328,7 @@ fn appendAsciiRow(writer: anytype, widths: []const usize, cells: []const []const } } -fn renderQueryConsoleSection(alloc: std.mem.Allocator, out: *std.ArrayList(u8), title: []const u8, rows: []const QueryResult) !void { - const w = out.writer(alloc); +fn renderQueryConsoleSection(alloc: std.mem.Allocator, w: *std.Io.Writer, title: []const u8, rows: []const QueryResult) !void { try w.print("{s}\n\n", .{title}); var seen_cases = std.StringHashMap(void).init(alloc); @@ -1309,7 +1384,7 @@ fn renderQueryConsoleSection(alloc: std.mem.Allocator, out: *std.ArrayList(u8), } } -fn runBenchmarks(alloc: std.mem.Allocator, args: []const []const u8) !void { +fn runBenchmarks(io: std.Io, alloc: std.mem.Allocator, args: []const [:0]const u8) !void { var profile_name: []const u8 = "quick"; var write_baseline = false; @@ -1329,55 +1404,67 @@ fn runBenchmarks(alloc: std.mem.Allocator, args: []const []const u8) !void { const profile = try getProfile(profile_name); - try common.ensureDir(BIN_DIR); - try common.ensureDir(RESULTS_DIR); - try ensureExternalParsersBuilt(alloc); - try buildRunners(alloc); + try common.ensureDir(io, BIN_DIR); + try common.ensureDir(io, RESULTS_DIR); + try ensureExternalParsersBuilt(io, alloc); + try buildRunners(io, alloc); var parse_results = std.ArrayList(ParseResult).empty; - defer 
parse_results.deinit(alloc); + defer { + freeParseSamples(alloc, parse_results.items); + parse_results.deinit(alloc); + } for (profile.fixtures) |fixture| { for (parse_parsers) |parser_name| { std.debug.print("benchmarking {s} on {s} ({d} iters)\n", .{ parser_name, fixture.name, fixture.iterations }); - const row = try benchParseOne(alloc, parser_name, fixture.name, fixture.iterations); + const row = try benchParseOne(io, alloc, parser_name, fixture.name, fixture.iterations); try parse_results.append(alloc, row); } } var query_parse_results = std.ArrayList(QueryResult).empty; - defer query_parse_results.deinit(alloc); + defer { + freeQuerySamples(alloc, query_parse_results.items); + query_parse_results.deinit(alloc); + } for (query_parse_modes) |qm| { for (profile.query_parse_cases) |qc| { std.debug.print("benchmarking query-parse {s} on {s} ({d} iters)\n", .{ qm.parser, qc.name, qc.iterations }); - const row = try benchQueryParseOne(alloc, qm.parser, qc.name, qc.selector, qc.iterations); + const row = try benchQueryParseOne(io, alloc, qm.parser, qc.name, qc.selector, qc.iterations); try query_parse_results.append(alloc, row); } } var query_match_results = std.ArrayList(QueryResult).empty; - defer query_match_results.deinit(alloc); + defer { + freeQuerySamples(alloc, query_match_results.items); + query_match_results.deinit(alloc); + } for (query_modes) |qm| { for (profile.query_match_cases) |qc| { std.debug.print("benchmarking query-match {s} on {s} ({d} iters)\n", .{ qm.parser, qc.name, qc.iterations }); - const row = try benchQueryExecOne(alloc, qm.parser, qm.mode, qc.name, qc.fixture, qc.selector, qc.iterations, false); + const row = try benchQueryExecOne(io, alloc, qm.parser, qm.mode, qc.name, qc.fixture, qc.selector, qc.iterations, false); try query_match_results.append(alloc, row); } } var query_cached_results = std.ArrayList(QueryResult).empty; - defer query_cached_results.deinit(alloc); + defer { + freeQuerySamples(alloc, query_cached_results.items); + 
query_cached_results.deinit(alloc); + } for (query_modes) |qm| { for (profile.query_cached_cases) |qc| { std.debug.print("benchmarking query-cached {s} on {s} ({d} iters)\n", .{ qm.parser, qc.name, qc.iterations }); - const row = try benchQueryExecOne(alloc, qm.parser, qm.mode, qc.name, qc.fixture, qc.selector, qc.iterations, true); + const row = try benchQueryExecOne(io, alloc, qm.parser, qm.mode, qc.name, qc.fixture, qc.selector, qc.iterations, true); try query_cached_results.append(alloc, row); } } const gate_rows = try evaluateGateRows(alloc, profile, parse_results.items); defer alloc.free(gate_rows); - try rerunFailedGateRows(alloc, profile, gate_rows); + try rerunFailedGateRows(io, alloc, profile, gate_rows); const json_out = struct { generated_unix: i64, @@ -1391,7 +1478,7 @@ fn runBenchmarks(alloc: std.mem.Allocator, args: []const []const u8) !void { query_cached_results: []const QueryResult, gate_summary: []const GateRow, }{ - .generated_unix = common.nowUnix(), + .generated_unix = common.nowUnix(io), .profile = profile.name, .repeats = repeats, .bench_modes = .{ .parse = &[_][]const u8{"ours"}, .query = &[_][]const u8{"ours"} }, @@ -1403,32 +1490,32 @@ fn runBenchmarks(alloc: std.mem.Allocator, args: []const []const u8) !void { .gate_summary = gate_rows, }; - var json_writer: std.io.Writer.Allocating = .init(alloc); + var json_writer: std.Io.Writer.Allocating = .init(alloc); defer json_writer.deinit(); var json_stream: std.json.Stringify = .{ .writer = &json_writer.writer, .options = .{ .whitespace = .indent_2 }, }; try json_stream.write(json_out); - try common.writeFile("bench/results/latest.json", json_writer.written()); + try common.writeFile(io, "bench/results/latest.json", json_writer.written()); - const md = try writeMarkdown(alloc, profile.name, parse_results.items, query_parse_results.items, query_match_results.items, query_cached_results.items, gate_rows); + const md = try writeMarkdown(io, alloc, profile.name, parse_results.items, 
query_parse_results.items, query_match_results.items, query_cached_results.items, gate_rows); defer alloc.free(md); - try common.writeFile("bench/results/latest.md", md); - try updateDocumentationBenchmarkSnapshot(alloc); - try updateReadmeAutoSummary(alloc); + try common.writeFile(io, "bench/results/latest.md", md); + try updateDocumentationBenchmarkSnapshot(io, alloc); + try updateReadmeAutoSummary(io, alloc); // Optional baseline behavior. const baseline_default = try std.fmt.allocPrint(alloc, "bench/results/baseline_{s}.json", .{profile.name}); defer alloc.free(baseline_default); if (write_baseline) { - try common.writeFile(baseline_default, json_writer.written()); + try common.writeFile(io, baseline_default, json_writer.written()); std.debug.print("wrote baseline {s}\n", .{baseline_default}); } var failures = std.ArrayList([]const u8).empty; - defer failures.deinit(alloc); + defer deinitOwnedStringList(alloc, &failures); for (gate_rows) |g| { if (std.mem.eql(u8, profile.name, "stable") and !g.pass) { @@ -1439,7 +1526,7 @@ fn runBenchmarks(alloc: std.mem.Allocator, args: []const []const u8) !void { std.debug.print("wrote bench/results/latest.json\n", .{}); std.debug.print("wrote bench/results/latest.md\n\n", .{}); - const console = try renderConsole(alloc, profile.name, parse_results.items, query_parse_results.items, query_match_results.items, query_cached_results.items, gate_rows); + const console = try renderConsole(io, alloc, profile.name, parse_results.items, query_parse_results.items, query_match_results.items, query_cached_results.items, gate_rows); defer alloc.free(console); std.debug.print("{s}\n", .{console}); if (failures.items.len > 0) { @@ -1465,14 +1552,12 @@ const QwCase = struct { const SelectorSuiteSummary = struct { total: usize, passed: usize, - failed: usize, examples: []const []const u8, }; const ParserSuiteSummary = struct { total: usize, passed: usize, - failed: usize, examples: []const []const u8, }; @@ -1522,9 +1607,9 @@ const 
ModeFailuresOut = struct { }, }; -fn ensureSuites(alloc: std.mem.Allocator) !void { - try common.ensureDir(SUITES_CACHE_DIR); - try common.ensureDir(SUITES_DIR); +fn ensureSuites(io: std.Io, alloc: std.mem.Allocator) !void { + try common.ensureDir(io, SUITES_CACHE_DIR); + try common.ensureDir(io, SUITES_DIR); const repos = [_]struct { name: []const u8, url: []const u8 }{ .{ .name = "html5lib-tests", .url = "https://github.com/html5lib/html5lib-tests.git" }, @@ -1536,25 +1621,25 @@ fn ensureSuites(alloc: std.mem.Allocator) !void { for (repos) |repo| { const cache_path = try std.fmt.allocPrint(alloc, "{s}/{s}", .{ SUITES_CACHE_DIR, repo.name }); defer alloc.free(cache_path); - if (!pathExists(cache_path)) { + if (!pathExists(io, cache_path)) { const clone_argv = [_][]const u8{ "git", "clone", "--depth", "1", repo.url, cache_path }; - try common.runInherit(alloc, &clone_argv, REPO_ROOT); + try common.runInherit(io, alloc, &clone_argv, REPO_ROOT); } else { const pull_argv = [_][]const u8{ "git", "-C", cache_path, "pull", "--ff-only" }; - common.runInherit(alloc, &pull_argv, REPO_ROOT) catch {}; + common.runInherit(io, alloc, &pull_argv, REPO_ROOT) catch {}; } const dst = try std.fmt.allocPrint(alloc, "{s}/{s}", .{ SUITES_DIR, repo.name }); defer alloc.free(dst); - if (!pathExists(dst)) { + if (!pathExists(io, dst)) { const work_clone_argv = [_][]const u8{ "git", "clone", "--depth", "1", cache_path, dst }; - try common.runInherit(alloc, &work_clone_argv, REPO_ROOT); + try common.runInherit(io, alloc, &work_clone_argv, REPO_ROOT); } } } -fn buildSuiteRunner(alloc: std.mem.Allocator) !void { - try common.ensureDir(BIN_DIR); +fn buildSuiteRunner(io: std.Io, alloc: std.mem.Allocator) !void { + try common.ensureDir(io, BIN_DIR); const root_mod = "-Mroot=tools/suite_runner.zig"; const html_mod = "-Mhtmlparser=src/root.zig"; const argv = [_][]const u8{ @@ -1568,39 +1653,41 @@ fn buildSuiteRunner(alloc: std.mem.Allocator) !void { "ReleaseFast", "-femit-bin=" ++ 
SUITE_RUNNER_BIN, }; - try common.runInherit(alloc, &argv, REPO_ROOT); + try common.runInherit(io, alloc, &argv, REPO_ROOT); } -fn runSelectorCount(alloc: std.mem.Allocator, mode: []const u8, fixture: []const u8, selector: []const u8) !usize { +fn runSelectorCount(io: std.Io, alloc: std.mem.Allocator, mode: []const u8, fixture: []const u8, selector: []const u8) !usize { const argv = [_][]const u8{ SUITE_RUNNER_BIN, "selector-count", mode, fixture, selector }; - const out = try common.runCaptureStdout(alloc, &argv, REPO_ROOT); + const out = try common.runCaptureStdout(io, alloc, &argv, REPO_ROOT); defer alloc.free(out); return std.fmt.parseInt(usize, out, 10); } -fn runSelectorCountScoped(alloc: std.mem.Allocator, mode: []const u8, fixture: []const u8, scope_tag: []const u8, selector: []const u8) !usize { +fn runSelectorCountScoped(io: std.Io, alloc: std.mem.Allocator, mode: []const u8, fixture: []const u8, scope_tag: []const u8, selector: []const u8) !usize { const argv = [_][]const u8{ SUITE_RUNNER_BIN, "selector-count-scope-tag", mode, fixture, scope_tag, selector }; - const out = try common.runCaptureStdout(alloc, &argv, REPO_ROOT); + const out = try common.runCaptureStdout(io, alloc, &argv, REPO_ROOT); defer alloc.free(out); return std.fmt.parseInt(usize, out, 10); } -fn runParseTagsFile(alloc: std.mem.Allocator, mode: []const u8, fixture: []const u8) ![]const u8 { +fn runParseTagsFile(io: std.Io, alloc: std.mem.Allocator, mode: []const u8, fixture: []const u8) ![]const u8 { const argv = [_][]const u8{ SUITE_RUNNER_BIN, "parse-tags-file", mode, fixture }; - return common.runCaptureStdout(alloc, &argv, REPO_ROOT); + return common.runCaptureStdout(io, alloc, &argv, REPO_ROOT); } -fn tempHtmlFile(alloc: std.mem.Allocator, html: []const u8) ![]u8 { - const r = std.crypto.random.int(u64); +fn tempHtmlFile(io: std.Io, alloc: std.mem.Allocator, html: []const u8) ![]u8 { + var src: std.Random.IoSource = .{ .io = io }; + const rng = src.interface(); + const r = 
rng.int(u64); const path = try std.fmt.allocPrint(alloc, "/tmp/htmlparser-suite-{x}.html", .{r}); - const f = try std.fs.createFileAbsolute(path, .{ .truncate = true }); - defer f.close(); - try f.writeAll(html); + const f = try std.Io.Dir.createFileAbsolute(io, path, .{ .truncate = true }); + defer f.close(io); + try f.writeStreamingAll(io, html); return path; } -fn loadNwCases(alloc: std.mem.Allocator) ![]NwCase { - const bytes = try common.readFileAlloc(alloc, CONFORMANCE_CASES_DIR ++ "/nwmatcher_cases.json"); +fn loadNwCases(io: std.Io, alloc: std.mem.Allocator) ![]NwCase { + const bytes = try common.readFileAlloc(io, alloc, CONFORMANCE_CASES_DIR ++ "/nwmatcher_cases.json"); defer alloc.free(bytes); const parsed = try std.json.parseFromSlice([]NwCase, alloc, bytes, .{}); defer parsed.deinit(); @@ -1614,8 +1701,8 @@ fn loadNwCases(alloc: std.mem.Allocator) ![]NwCase { return out; } -fn loadQwCases(alloc: std.mem.Allocator) ![]QwCase { - const bytes = try common.readFileAlloc(alloc, CONFORMANCE_CASES_DIR ++ "/qwery_cases.json"); +fn loadQwCases(io: std.Io, alloc: std.mem.Allocator) ![]QwCase { + const bytes = try common.readFileAlloc(io, alloc, CONFORMANCE_CASES_DIR ++ "/qwery_cases.json"); defer alloc.free(bytes); const parsed = try std.json.parseFromSlice([]QwCase, alloc, bytes, .{}); defer parsed.deinit(); @@ -1639,19 +1726,20 @@ fn dupeStringSlices(alloc: std.mem.Allocator, src: []const []const u8) ![]const return out; } -fn htmlPreview(alloc: std.mem.Allocator, html: []const u8) ![]const u8 { +fn htmlPreview(io: std.Io, alloc: std.mem.Allocator, html: []const u8) ![]const u8 { + _ = io; const max_preview: usize = 220; const clipped = html[0..@min(html.len, max_preview)]; return std.mem.replaceOwned(u8, alloc, clipped, "\n", "\\n"); } -fn runSelectorSuites(alloc: std.mem.Allocator, mode: []const u8) !SelectorSuitesResult { - const nw_cases = try loadNwCases(alloc); +fn runSelectorSuites(io: std.Io, alloc: std.mem.Allocator, mode: []const u8) 
!SelectorSuitesResult { + const nw_cases = try loadNwCases(io, alloc); defer { for (nw_cases) |c| alloc.free(c.selector); alloc.free(nw_cases); } - const qw_cases = try loadQwCases(alloc); + const qw_cases = try loadQwCases(io, alloc); defer { for (qw_cases) |c| { alloc.free(c.selector); @@ -1662,9 +1750,9 @@ fn runSelectorSuites(alloc: std.mem.Allocator, mode: []const u8) !SelectorSuites const nw_fixture = SUITES_DIR ++ "/css-select/test/fixtures/nwmatcher.html"; const qw_fixture = SUITES_DIR ++ "/css-select/test/fixtures/qwery.html"; - const qw_doc_html = try common.readFileAlloc(alloc, CONFORMANCE_CASES_DIR ++ "/qwery_doc.html"); + const qw_doc_html = try common.readFileAlloc(io, alloc, CONFORMANCE_CASES_DIR ++ "/qwery_doc.html"); defer alloc.free(qw_doc_html); - const qw_frag_html = try common.readFileAlloc(alloc, CONFORMANCE_CASES_DIR ++ "/qwery_frag.html"); + const qw_frag_html = try common.readFileAlloc(io, alloc, CONFORMANCE_CASES_DIR ++ "/qwery_frag.html"); defer alloc.free(qw_frag_html); var nw_passed: usize = 0; @@ -1674,7 +1762,7 @@ fn runSelectorSuites(alloc: std.mem.Allocator, mode: []const u8) !SelectorSuites defer nw_failures.deinit(alloc); for (nw_cases, 0..) |c, idx| { if (idx >= 140) break; - const got = runSelectorCount(alloc, mode, nw_fixture, c.selector) catch { + const got = runSelectorCount(io, alloc, mode, nw_fixture, c.selector) catch { const msg = try std.fmt.allocPrint(alloc, "{s} expected {d} got", .{ c.selector, c.expected }); if (nw_examples.items.len < 8) try nw_examples.append(alloc, msg); try nw_failures.append(alloc, .{ @@ -1713,7 +1801,7 @@ fn runSelectorSuites(alloc: std.mem.Allocator, mode: []const u8) !SelectorSuites for (qw_cases, 0..) 
|c, idx| { const got = blk: { if (std.mem.eql(u8, c.context, "document")) { - break :blk runSelectorCount(alloc, mode, qw_fixture, c.selector) catch { + break :blk runSelectorCount(io, alloc, mode, qw_fixture, c.selector) catch { if (qw_examples.items.len < 8) { const msg = try std.fmt.allocPrint(alloc, "{s} {s} expected {d} got ", .{ c.context, c.selector, c.expected }); try qw_examples.append(alloc, msg); @@ -1730,12 +1818,12 @@ fn runSelectorSuites(alloc: std.mem.Allocator, mode: []const u8) !SelectorSuites }; } const html = if (std.mem.eql(u8, c.context, "doc")) qw_doc_html else qw_frag_html; - const tmp = try tempHtmlFile(alloc, html); + const tmp = try tempHtmlFile(io, alloc, html); defer { - std.fs.deleteFileAbsolute(tmp) catch {}; + std.Io.Dir.deleteFileAbsolute(io, tmp) catch {}; alloc.free(tmp); } - break :blk runSelectorCountScoped(alloc, mode, tmp, "root", c.selector) catch { + break :blk runSelectorCountScoped(io, alloc, mode, tmp, "root", c.selector) catch { if (qw_examples.items.len < 8) { const msg = try std.fmt.allocPrint(alloc, "{s} {s} expected {d} got ", .{ c.context, c.selector, c.expected }); try qw_examples.append(alloc, msg); @@ -1774,13 +1862,11 @@ fn runSelectorSuites(alloc: std.mem.Allocator, mode: []const u8) !SelectorSuites .nw = .{ .total = @min(nw_cases.len, 140), .passed = nw_passed, - .failed = @min(nw_cases.len, 140) - nw_passed, .examples = try nw_examples.toOwnedSlice(alloc), }, .qw = .{ .total = qw_cases.len, .passed = qw_passed, - .failed = qw_cases.len - qw_passed, .examples = try qw_examples.toOwnedSlice(alloc), }, .nw_failures = try nw_failures.toOwnedSlice(alloc), @@ -1813,9 +1899,41 @@ const ParserCase = struct { expected: []const []const u8, }; -fn parseHtml5libDat(alloc: std.mem.Allocator, path: []const u8, out: *std.ArrayList(ParserCase)) !void { - const text = try common.readFileAlloc(alloc, path); +fn freeOwnedStringSlice(alloc: std.mem.Allocator, items: []const []const u8) void { + for (items) |item| 
alloc.free(item); + alloc.free(items); +} + +fn freeParserCase(alloc: std.mem.Allocator, c: ParserCase) void { + alloc.free(c.html); + freeOwnedStringSlice(alloc, c.expected); +} + +fn freeParserCases(alloc: std.mem.Allocator, cases: []const ParserCase) void { + for (cases) |c| freeParserCase(alloc, c); + alloc.free(cases); +} + +fn deinitParserCaseList(alloc: std.mem.Allocator, list: *std.ArrayList(ParserCase)) void { + for (list.items) |c| freeParserCase(alloc, c); + list.deinit(alloc); +} + +fn transferOwnedParserCases(alloc: std.mem.Allocator, dst: *std.ArrayList(ParserCase), cases: []ParserCase) !void { + errdefer freeParserCases(alloc, cases); + try dst.appendSlice(alloc, cases); + alloc.free(cases); +} + +fn parseHtml5libDat(io: std.Io, alloc: std.mem.Allocator, path: []const u8) ![]ParserCase { + const text = try common.readFileAlloc(io, alloc, path); defer alloc.free(text); + return parseHtml5libDatText(alloc, text); +} + +fn parseHtml5libDatText(alloc: std.mem.Allocator, text: []const u8) ![]ParserCase { + var out = std.ArrayList(ParserCase).empty; + errdefer deinitParserCaseList(alloc, &out); var blocks = std.mem.splitSequence(u8, text, "\n#data\n"); while (blocks.next()) |raw_blk| { var blk = raw_blk; @@ -1831,9 +1949,10 @@ fn parseHtml5libDat(alloc: std.mem.Allocator, path: []const u8, out: *std.ArrayL html_in = html_in[0..err_idx]; } const html_copy = try alloc.dupe(u8, html_in); + errdefer alloc.free(html_copy); var expected = std.ArrayList([]const u8).empty; - errdefer expected.deinit(alloc); + errdefer deinitOwnedStringList(alloc, &expected); var lines = std.mem.splitScalar(u8, rest, '\n'); while (lines.next()) |line| { if (line.len < 3 or line[0] != '|') continue; @@ -1849,13 +1968,16 @@ fn parseHtml5libDat(alloc: std.mem.Allocator, path: []const u8, out: *std.ArrayL alloc.free(lower); continue; } - try expected.append(alloc, lower); + try appendOwnedString(alloc, &expected, lower); } + const expected_slice = try expected.toOwnedSlice(alloc); + 
errdefer freeOwnedStringSlice(alloc, expected_slice); try out.append(alloc, .{ .html = html_copy, - .expected = try expected.toOwnedSlice(alloc), + .expected = expected_slice, }); } + return out.toOwnedSlice(alloc); } fn fromHex(c: u8) !u8 { @@ -1896,7 +2018,7 @@ fn quoteEnd(text: []const u8, start: usize) ?usize { fn parseWptTreeExpected(alloc: std.mem.Allocator, decoded_tree: []const u8) ![]const []const u8 { var expected = std.ArrayList([]const u8).empty; - errdefer expected.deinit(alloc); + errdefer deinitOwnedStringList(alloc, &expected); var lines = std.mem.splitScalar(u8, decoded_tree, '\n'); while (lines.next()) |line| { @@ -1913,17 +2035,23 @@ fn parseWptTreeExpected(alloc: std.mem.Allocator, decoded_tree: []const u8) ![]c alloc.free(lower); continue; } - try expected.append(alloc, lower); + try appendOwnedString(alloc, &expected, lower); } return expected.toOwnedSlice(alloc); } -fn parseWptHtmlSuiteFile(alloc: std.mem.Allocator, path: []const u8, out: *std.ArrayList(ParserCase)) !void { - const text = try common.readFileAlloc(alloc, path); +fn parseWptHtmlSuiteFile(io: std.Io, alloc: std.mem.Allocator, path: []const u8) ![]ParserCase { + const text = try common.readFileAlloc(io, alloc, path); defer alloc.free(text); + return parseWptHtmlSuiteText(alloc, text); +} - if (std.mem.indexOf(u8, text, "var tests = {") == null) return; - if (std.mem.indexOf(u8, text, "init_tests(") == null) return; +fn parseWptHtmlSuiteText(alloc: std.mem.Allocator, text: []const u8) ![]ParserCase { + if (std.mem.indexOf(u8, text, "var tests = {") == null) return try alloc.alloc(ParserCase, 0); + if (std.mem.indexOf(u8, text, "init_tests(") == null) return try alloc.alloc(ParserCase, 0); + + var out = std.ArrayList(ParserCase).empty; + errdefer deinitParserCaseList(alloc, &out); var pos: usize = 0; while (std.mem.indexOfPos(u8, text, pos, "[async_test(")) |mark| { @@ -1939,7 +2067,7 @@ fn parseWptHtmlSuiteFile(alloc: std.mem.Allocator, path: []const u8, out: *std.A // 
[async_test(...), "", " ", " "] // This parser harness only validates full-document cases, so skip any // entry that carries additional args after expected tree string. - const tail = std.mem.trimLeft(u8, text[expected_end + 1 ..], " \t\r\n"); + const tail = std.mem.trimStart(u8, text[expected_end + 1 ..], " \t\r\n"); if (tail.len == 0) break; if (tail[0] == ',') continue; if (tail[0] != ']') continue; @@ -1963,6 +2091,7 @@ fn parseWptHtmlSuiteFile(alloc: std.mem.Allocator, path: []const u8, out: *std.A .expected = expected, }); } + return out.toOwnedSlice(alloc); } fn parseTagJsonArray(alloc: std.mem.Allocator, text: []const u8) ![]const []const u8 { @@ -1970,7 +2099,7 @@ fn parseTagJsonArray(alloc: std.mem.Allocator, text: []const u8) ![]const []cons defer parsed.deinit(); if (parsed.value != .array) return error.InvalidJson; var tags = std.ArrayList([]const u8).empty; - errdefer tags.deinit(alloc); + errdefer deinitOwnedStringList(alloc, &tags); for (parsed.value.array.items) |it| { if (it != .string) continue; const lower = try std.ascii.allocLowerString(alloc, it.string); @@ -1978,7 +2107,7 @@ fn parseTagJsonArray(alloc: std.mem.Allocator, text: []const u8) ![]const []cons alloc.free(lower); continue; } - try tags.append(alloc, lower); + try appendOwnedString(alloc, &tags, lower); } return tags.toOwnedSlice(alloc); } @@ -1991,7 +2120,7 @@ fn eqlStringSlices(a: []const []const u8, b: []const []const u8) bool { return true; } -fn runParserCases(alloc: std.mem.Allocator, mode: []const u8, cases: []const ParserCase, max_cases: usize) !ParserSuiteResult { +fn runParserCases(io: std.Io, alloc: std.mem.Allocator, mode: []const u8, cases: []const ParserCase, max_cases: usize) !ParserSuiteResult { const limit = @min(max_cases, cases.len); var passed: usize = 0; var examples = std.ArrayList([]const u8).empty; @@ -2001,12 +2130,12 @@ fn runParserCases(alloc: std.mem.Allocator, mode: []const u8, cases: []const Par var idx: usize = 0; while (idx < limit) : (idx += 1) { 
const c = cases[idx]; - const tmp = try tempHtmlFile(alloc, c.html); + const tmp = try tempHtmlFile(io, alloc, c.html); defer { - std.fs.deleteFileAbsolute(tmp) catch {}; + std.Io.Dir.deleteFileAbsolute(io, tmp) catch {}; alloc.free(tmp); } - const raw = runParseTagsFile(alloc, mode, tmp) catch { + const raw = runParseTagsFile(io, alloc, mode, tmp) catch { if (examples.items.len < 10) { const src = std.mem.replaceOwned(u8, alloc, c.html, "\n", "\\n") catch c.html; const msg = std.fmt.allocPrint(alloc, "{s} -> ", .{src}) catch "parse-error"; @@ -2015,7 +2144,7 @@ fn runParserCases(alloc: std.mem.Allocator, mode: []const u8, cases: []const Par const empty: []const []const u8 = &.{}; try failures.append(alloc, .{ .case_index = idx, - .input_preview = try htmlPreview(alloc, c.html), + .input_preview = try htmlPreview(io, alloc, c.html), .input_len = c.html.len, .expected = try dupeStringSlices(alloc, c.expected), .actual = empty, @@ -2041,7 +2170,7 @@ fn runParserCases(alloc: std.mem.Allocator, mode: []const u8, cases: []const Par } try failures.append(alloc, .{ .case_index = idx, - .input_preview = try htmlPreview(alloc, c.html), + .input_preview = try htmlPreview(io, alloc, c.html), .input_len = c.html.len, .expected = try dupeStringSlices(alloc, c.expected), .actual = try dupeStringSlices(alloc, got), @@ -2054,25 +2183,25 @@ fn runParserCases(alloc: std.mem.Allocator, mode: []const u8, cases: []const Par .summary = .{ .total = limit, .passed = passed, - .failed = limit - passed, .examples = try examples.toOwnedSlice(alloc), }, .failures = try failures.toOwnedSlice(alloc), }; } -fn runHtml5libParserSuite(alloc: std.mem.Allocator, mode: []const u8, max_cases: usize) !ParserSuiteResult { +fn runHtml5libParserSuite(io: std.Io, alloc: std.mem.Allocator, mode: []const u8, max_cases: usize) !ParserSuiteResult { const tc_dir = SUITES_DIR ++ "/html5lib-tests/tree-construction"; - var dir = try std.fs.cwd().openDir(tc_dir, .{ .iterate = true }); - defer dir.close(); + var dir 
= try std.Io.Dir.cwd().openDir(io, tc_dir, .{ .iterate = true }); + defer dir.close(io); var dat_names = std.ArrayList([]const u8).empty; - defer dat_names.deinit(alloc); + defer deinitOwnedStringList(alloc, &dat_names); var it = dir.iterate(); - while (try it.next()) |entry| { + while (try it.next(io)) |entry| { if (entry.kind != .file) continue; if (!std.mem.endsWith(u8, entry.name, ".dat")) continue; - try dat_names.append(alloc, try alloc.dupe(u8, entry.name)); + const name = try alloc.dupe(u8, entry.name); + try appendOwnedString(alloc, &dat_names, name); } std.mem.sort([]const u8, dat_names.items, {}, struct { fn lt(_: void, a: []const u8, b: []const u8) bool { @@ -2081,38 +2210,33 @@ fn runHtml5libParserSuite(alloc: std.mem.Allocator, mode: []const u8, max_cases: }.lt); var cases = std.ArrayList(ParserCase).empty; - defer { - for (cases.items) |c| { - alloc.free(c.html); - for (c.expected) |tag| alloc.free(tag); - alloc.free(c.expected); - } - cases.deinit(alloc); - } + defer deinitParserCaseList(alloc, &cases); for (dat_names.items) |name| { const path = try std.fmt.allocPrint(alloc, "{s}/{s}", .{ tc_dir, name }); defer alloc.free(path); - try parseHtml5libDat(alloc, path, &cases); + const parsed_cases = try parseHtml5libDat(io, alloc, path); + try transferOwnedParserCases(alloc, &cases, parsed_cases); } - return runParserCases(alloc, mode, cases.items, max_cases); + return runParserCases(io, alloc, mode, cases.items, max_cases); } -fn runWptParserSuite(alloc: std.mem.Allocator, mode: []const u8, max_cases: usize) !ParserSuiteResult { +fn runWptParserSuite(io: std.Io, alloc: std.mem.Allocator, mode: []const u8, max_cases: usize) !ParserSuiteResult { const wpt_dir = SUITES_DIR ++ "/wpt/html/syntax/parsing"; - var dir = try std.fs.cwd().openDir(wpt_dir, .{ .iterate = true }); - defer dir.close(); + var dir = try std.Io.Dir.cwd().openDir(io, wpt_dir, .{ .iterate = true }); + defer dir.close(io); var html_names = std.ArrayList([]const u8).empty; - defer 
html_names.deinit(alloc); + defer deinitOwnedStringList(alloc, &html_names); var walker = try dir.walk(alloc); defer walker.deinit(); - while (try walker.next()) |entry| { + while (try walker.next(io)) |entry| { if (entry.kind != .file) continue; if (!std.mem.endsWith(u8, entry.path, ".html")) continue; const base = std.fs.path.basename(entry.path); if (!std.mem.startsWith(u8, base, "html5lib_")) continue; - try html_names.append(alloc, try alloc.dupe(u8, entry.path)); + const path_copy = try alloc.dupe(u8, entry.path); + try appendOwnedString(alloc, &html_names, path_copy); } std.mem.sort([]const u8, html_names.items, {}, struct { fn lt(_: void, a: []const u8, b: []const u8) bool { @@ -2121,19 +2245,13 @@ fn runWptParserSuite(alloc: std.mem.Allocator, mode: []const u8, max_cases: usiz }.lt); var cases = std.ArrayList(ParserCase).empty; - defer { - for (cases.items) |c| { - alloc.free(c.html); - for (c.expected) |tag| alloc.free(tag); - alloc.free(c.expected); - } - cases.deinit(alloc); - } + defer deinitParserCaseList(alloc, &cases); for (html_names.items) |name| { const path = try std.fmt.allocPrint(alloc, "{s}/{s}", .{ wpt_dir, name }); defer alloc.free(path); - try parseWptHtmlSuiteFile(alloc, path, &cases); + const parsed_cases = try parseWptHtmlSuiteFile(io, alloc, path); + try transferOwnedParserCases(alloc, &cases, parsed_cases); } if (cases.items.len == 0 and html_names.items.len != 0) { @@ -2167,17 +2285,16 @@ fn runWptParserSuite(alloc: std.mem.Allocator, mode: []const u8, max_cases: usiz .summary = .{ .total = total, .passed = 0, - .failed = total, .examples = try examples.toOwnedSlice(alloc), }, .failures = try failures.toOwnedSlice(alloc), }; } - return runParserCases(alloc, mode, cases.items, max_cases); + return runParserCases(io, alloc, mode, cases.items, max_cases); } -fn runExternalSuites(alloc: std.mem.Allocator, args: []const []const u8) !void { +fn runExternalSuites(io: std.Io, alloc: std.mem.Allocator, args: []const [:0]const u8) !void { var 
mode_arg: []const u8 = "both"; var max_cases: usize = 600; var max_whatwg_cases: usize = 500; @@ -2210,9 +2327,9 @@ fn runExternalSuites(alloc: std.mem.Allocator, args: []const []const u8) !void { } else return error.InvalidArgument; } - try ensureSuites(alloc); - try buildSuiteRunner(alloc); - try common.ensureDir(RESULTS_DIR); + try ensureSuites(io, alloc); + try buildSuiteRunner(io, alloc); + try common.ensureDir(io, RESULTS_DIR); const modes = if (std.mem.eql(u8, mode_arg, "both")) &[_][]const u8{ "strictest", "fastest" } else &[_][]const u8{mode_arg}; var mode_reports = std.ArrayList(struct { @@ -2229,9 +2346,9 @@ fn runExternalSuites(alloc: std.mem.Allocator, args: []const []const u8) !void { defer mode_reports.deinit(alloc); for (modes) |mode| { - const sel = try runSelectorSuites(alloc, mode); - const parser_html5lib = try runHtml5libParserSuite(alloc, mode, max_cases); - const parser_whatwg = try runWptParserSuite(alloc, mode, max_whatwg_cases); + const sel = try runSelectorSuites(io, alloc, mode); + const parser_html5lib = try runHtml5libParserSuite(io, alloc, mode, max_cases); + const parser_whatwg = try runWptParserSuite(io, alloc, mode, max_whatwg_cases); try mode_reports.append(alloc, .{ .mode = mode, .nw = sel.nw, @@ -2246,24 +2363,24 @@ fn runExternalSuites(alloc: std.mem.Allocator, args: []const []const u8) !void { std.debug.print("Mode: {s}\n", .{mode}); std.debug.print(" Selector suites:\n", .{}); - std.debug.print(" nwmatcher: {d}/{d} passed ({d} failed)\n", .{ sel.nw.passed, sel.nw.total, sel.nw.failed }); - std.debug.print(" qwery_contextual: {d}/{d} passed ({d} failed)\n", .{ sel.qw.passed, sel.qw.total, sel.qw.failed }); + std.debug.print(" nwmatcher: {d}/{d} passed ({d} failed)\n", .{ sel.nw.passed, sel.nw.total, failedCount(sel.nw) }); + std.debug.print(" qwery_contextual: {d}/{d} passed ({d} failed)\n", .{ sel.qw.passed, sel.qw.total, failedCount(sel.qw) }); std.debug.print(" Parser suites:\n", .{}); std.debug.print(" html5lib 
tree-construction subset: {d}/{d} passed ({d} failed)\n", .{ parser_html5lib.summary.passed, parser_html5lib.summary.total, - parser_html5lib.summary.failed, + failedCount(parser_html5lib.summary), }); std.debug.print(" WHATWG HTML parsing (WPT html5lib_* corpus): {d}/{d} passed ({d} failed)\n", .{ parser_whatwg.summary.passed, parser_whatwg.summary.total, - parser_whatwg.summary.failed, + failedCount(parser_whatwg.summary), }); } - var json_buf = std.ArrayList(u8).empty; - defer json_buf.deinit(alloc); - const jw = json_buf.writer(alloc); + var json_buf: std.Io.Writer.Allocating = .init(alloc); + defer json_buf.deinit(); + const jw = &json_buf.writer; try jw.writeAll("{\"modes\":{"); for (mode_reports.items, 0..) |mr, idx_mode| { if (idx_mode != 0) try jw.writeAll(","); @@ -2271,23 +2388,23 @@ fn runExternalSuites(alloc: std.mem.Allocator, args: []const []const u8) !void { try jw.print("\"selector_suites\":{{\"nwmatcher\":{{\"total\":{d},\"passed\":{d},\"failed\":{d}}},\"qwery_contextual\":{{\"total\":{d},\"passed\":{d},\"failed\":{d}}}}},", .{ mr.nw.total, mr.nw.passed, - mr.nw.failed, + failedCount(mr.nw), mr.qw.total, mr.qw.passed, - mr.qw.failed, + failedCount(mr.qw), }); try jw.print("\"parser_suites\":{{\"html5lib_subset\":{{\"total\":{d},\"passed\":{d},\"failed\":{d}}},\"whatwg_html_parsing\":{{\"total\":{d},\"passed\":{d},\"failed\":{d}}}}}", .{ mr.parser_html5lib.total, mr.parser_html5lib.passed, - mr.parser_html5lib.failed, + failedCount(mr.parser_html5lib), mr.parser_whatwg.total, mr.parser_whatwg.passed, - mr.parser_whatwg.failed, + failedCount(mr.parser_whatwg), }); try jw.writeAll("}"); } try jw.writeAll("}}"); - try common.writeFile(json_out, json_buf.items); + try common.writeFile(io, json_out, json_buf.written()); std.debug.print("Wrote report: {s}\n", .{json_out}); var failure_modes = std.ArrayList(ModeFailuresOut).empty; @@ -2309,18 +2426,18 @@ fn runExternalSuites(alloc: std.mem.Allocator, args: []const []const u8) !void { const 
failure_json_out: ExternalFailuresOut = .{ .modes = failure_modes.items, }; - var failure_json_writer: std.io.Writer.Allocating = .init(alloc); + var failure_json_writer: std.Io.Writer.Allocating = .init(alloc); defer failure_json_writer.deinit(); var failure_json_stream: std.json.Stringify = .{ .writer = &failure_json_writer.writer, .options = .{ .whitespace = .indent_2 }, }; try failure_json_stream.write(failure_json_out); - try common.writeFile(failures_out, failure_json_writer.written()); + try common.writeFile(io, failures_out, failure_json_writer.written()); std.debug.print("Wrote failures: {s}\n", .{failures_out}); if (std.mem.eql(u8, json_out, "bench/results/external_suite_report.json")) { - try updateReadmeAutoSummary(alloc); + try updateReadmeAutoSummary(io, alloc); } } @@ -2328,9 +2445,9 @@ fn cmpStringSlice(_: void, a: []const u8, b: []const u8) bool { return std.mem.lessThan(u8, a, b); } -fn collectMarkdownFiles(alloc: std.mem.Allocator) ![][]const u8 { +fn collectMarkdownFiles(io: std.Io, alloc: std.mem.Allocator) ![][]const u8 { var files = std.ArrayList([]const u8).empty; - errdefer files.deinit(alloc); + errdefer deinitOwnedStringList(alloc, &files); const root_docs = [_][]const u8{ "README.md", @@ -2341,21 +2458,22 @@ fn collectMarkdownFiles(alloc: std.mem.Allocator) ![][]const u8 { "bench/README.md", }; for (root_docs) |p| { - if (common.fileExists(p)) { - try files.append(alloc, try alloc.dupe(u8, p)); + if (common.fileExists(io, p)) { + const path_copy = try alloc.dupe(u8, p); + try appendOwnedString(alloc, &files, path_copy); } } - if (common.fileExists("docs")) { - var docs_dir = try std.fs.cwd().openDir("docs", .{ .iterate = true }); - defer docs_dir.close(); + if (common.fileExists(io, "docs")) { + var docs_dir = try std.Io.Dir.cwd().openDir(io, "docs", .{ .iterate = true }); + defer docs_dir.close(io); var walker = try docs_dir.walk(alloc); defer walker.deinit(); - while (try walker.next()) |entry| { + while (try walker.next(io)) |entry| { 
if (entry.kind != .file) continue; if (!std.mem.endsWith(u8, entry.path, ".md")) continue; const joined = try std.fs.path.join(alloc, &[_][]const u8{ "docs", entry.path }); - try files.append(alloc, joined); + try appendOwnedString(alloc, &files, joined); } } @@ -2363,30 +2481,31 @@ fn collectMarkdownFiles(alloc: std.mem.Allocator) ![][]const u8 { return files.toOwnedSlice(alloc); } -fn collectExampleFiles(alloc: std.mem.Allocator) ![][]const u8 { +fn collectExampleFiles(io: std.Io, alloc: std.mem.Allocator) ![][]const u8 { var files = std.ArrayList([]const u8).empty; - errdefer files.deinit(alloc); + errdefer deinitOwnedStringList(alloc, &files); - var examples_dir = try std.fs.cwd().openDir("examples", .{ .iterate = true }); - defer examples_dir.close(); + var examples_dir = try std.Io.Dir.cwd().openDir(io, "examples", .{ .iterate = true }); + defer examples_dir.close(io); var walker = try examples_dir.walk(alloc); defer walker.deinit(); - while (try walker.next()) |entry| { + while (try walker.next(io)) |entry| { if (entry.kind != .file) continue; if (!std.mem.endsWith(u8, entry.path, ".zig")) continue; const joined = try std.fs.path.join(alloc, &[_][]const u8{ "examples", entry.path }); - try files.append(alloc, joined); + try appendOwnedString(alloc, &files, joined); } std.mem.sort([]const u8, files.items, {}, cmpStringSlice); return files.toOwnedSlice(alloc); } -fn loadBuildStepSet(alloc: std.mem.Allocator) !std.StringHashMap(void) { - const out = try common.runCaptureStdout(alloc, &[_][]const u8{ "zig", "build", "--list-steps" }, REPO_ROOT); +fn loadBuildStepSet(io: std.Io, alloc: std.mem.Allocator) !std.StringHashMap(void) { + const out = try common.runCaptureStdout(io, alloc, &[_][]const u8{ "zig", "build", "--list-steps" }, REPO_ROOT); defer alloc.free(out); var set = std.StringHashMap(void).init(alloc); + errdefer deinitOwnedStringSet(alloc, &set); var lines = std.mem.splitScalar(u8, out, '\n'); while (lines.next()) |line_raw| { const line = 
std.mem.trim(u8, line_raw, " \t\r"); @@ -2394,76 +2513,67 @@ fn loadBuildStepSet(alloc: std.mem.Allocator) !std.StringHashMap(void) { const first_ws = std.mem.indexOfAny(u8, line, " \t") orelse line.len; const step = line[0..first_ws]; if (step.len == 0) continue; - try set.put(try alloc.dupe(u8, step), {}); + const step_copy = try alloc.dupe(u8, step); + try putOwnedString(alloc, &set, step_copy); } return set; } -fn trimMarkdownLinkTarget(raw: []const u8) []const u8 { - var target = std.mem.trim(u8, raw, " \t\r"); +fn validateMarkdownLink(io: std.Io, alloc: std.mem.Allocator, md_path: []const u8, line_no: usize, target_raw: []const u8) !bool { + var target = std.mem.trim(u8, target_raw, " \t\r"); if (target.len >= 2 and target[0] == '<' and target[target.len - 1] == '>') { target = target[1 .. target.len - 1]; } - if (target.len == 0) return target; + if (target.len == 0) return true; if (target[0] != '<') { const ws_idx = std.mem.indexOfAny(u8, target, " \t\r") orelse target.len; target = target[0..ws_idx]; } - return target; -} - -fn sliceBeforeFirstAny(haystack: []const u8, chars: []const u8) []const u8 { - const idx = std.mem.indexOfAny(u8, haystack, chars) orelse haystack.len; - return haystack[0..idx]; -} - -fn isRemoteLink(target: []const u8) bool { - if (std.mem.startsWith(u8, target, "http://")) return true; - if (std.mem.startsWith(u8, target, "https://")) return true; - if (std.mem.startsWith(u8, target, "mailto:")) return true; - if (std.mem.startsWith(u8, target, "tel:")) return true; - return std.mem.indexOf(u8, target, "://") != null; -} - -fn validateMarkdownLink(alloc: std.mem.Allocator, md_path: []const u8, line_no: usize, target_raw: []const u8, ok: *bool) !void { - const target = trimMarkdownLinkTarget(target_raw); - if (target.len == 0) return; - if (target[0] == '#') return; - if (isRemoteLink(target)) return; + if (target.len == 0) return true; + if (target[0] == '#') return true; + if (std.mem.startsWith(u8, target, "http://") or + 
std.mem.startsWith(u8, target, "https://") or + std.mem.startsWith(u8, target, "mailto:") or + std.mem.startsWith(u8, target, "tel:") or + std.mem.indexOf(u8, target, "://") != null) + { + return true; + } - const path_only = sliceBeforeFirstAny(target, "#?"); - if (path_only.len == 0) return; + const path_end = std.mem.indexOfAny(u8, target, "#?") orelse target.len; + const path_only = target[0..path_end]; + if (path_only.len == 0) return true; if (std.mem.startsWith(u8, path_only, "/")) { std.debug.print("docs-check: {s}:{d}: absolute markdown path is not allowed: {s}\n", .{ md_path, line_no, target }); - ok.* = false; - return; + return false; } const base_dir = std.fs.path.dirname(md_path) orelse "."; const resolved = try std.fs.path.join(alloc, &[_][]const u8{ base_dir, path_only }); defer alloc.free(resolved); - if (common.fileExists(resolved)) return; + if (common.fileExists(io, resolved)) return true; if (std.mem.endsWith(u8, path_only, "/")) { const with_readme = try std.fs.path.join(alloc, &[_][]const u8{ resolved, "README.md" }); defer alloc.free(with_readme); - if (common.fileExists(with_readme)) return; + if (common.fileExists(io, with_readme)) return true; } std.debug.print("docs-check: {s}:{d}: unresolved markdown link: {s}\n", .{ md_path, line_no, target }); - ok.* = false; + return false; } -fn checkMarkdownLinks(alloc: std.mem.Allocator, md_path: []const u8, content: []const u8, ok: *bool) !void { +fn checkMarkdownLinks(io: std.Io, alloc: std.mem.Allocator, md_path: []const u8, content: []const u8) !bool { + var ok = true; var in_fence = false; var line_no: usize = 0; var lines = std.mem.splitScalar(u8, content, '\n'); while (lines.next()) |line_raw| { line_no += 1; - const line = std.mem.trimRight(u8, line_raw, "\r"); - const trimmed = std.mem.trimLeft(u8, line, " \t"); + const line = std.mem.trimEnd(u8, line_raw, "\r"); + const trimmed = std.mem.trimStart(u8, line, " \t"); if (std.mem.startsWith(u8, trimmed, "```")) { in_fence = !in_fence; 
continue; @@ -2485,20 +2595,22 @@ fn checkMarkdownLinks(alloc: std.mem.Allocator, md_path: []const u8, content: [] i = close + 2; continue; }; - try validateMarkdownLink(alloc, md_path, line_no, line[close + 2 .. end], ok); + ok = (try validateMarkdownLink(io, alloc, md_path, line_no, line[close + 2 .. end])) and ok; i = end + 1; } } + return ok; } -fn checkLocalAbsolutePaths(md_path: []const u8, content: []const u8, ok: *bool) void { +fn checkLocalAbsolutePaths(md_path: []const u8, content: []const u8) bool { if (std.mem.indexOf(u8, content, "/home/") != null or std.mem.indexOf(u8, content, "/Users/") != null or std.mem.indexOf(u8, content, "C:\\Users\\") != null) { std.debug.print("docs-check: {s}: contains machine-local absolute path\n", .{md_path}); - ok.* = false; + return false; } + return true; } fn parseStepAfterBuild(content: []const u8, start: usize) ?[]const u8 { @@ -2518,7 +2630,8 @@ fn parseStepAfterBuild(content: []const u8, start: usize) ?[]const u8 { return null; } -fn checkDocumentedBuildCommands(md_path: []const u8, content: []const u8, step_set: *const std.StringHashMap(void), ok: *bool) void { +fn checkDocumentedBuildCommands(md_path: []const u8, content: []const u8, step_set: std.StringHashMap(void)) bool { + var ok = true; var pos: usize = 0; while (std.mem.indexOfPos(u8, content, pos, "zig build")) |found| { if (found > 0 and !std.ascii.isWhitespace(content[found - 1]) and content[found - 1] != '`') { @@ -2538,18 +2651,18 @@ fn checkDocumentedBuildCommands(md_path: []const u8, content: []const u8, step_s }; if (!step_set.contains(step)) { std.debug.print("docs-check: {s}: references unknown zig build step '{s}'\n", .{ md_path, step }); - ok.* = false; + ok = false; } pos = found + 1; } + return ok; } -fn checkChangelogCompatibilityLabels(content: []const u8, ok: *bool) void { +fn checkChangelogCompatibilityLabels(content: []const u8) bool { const header = "## [Unreleased]"; const start = std.mem.indexOf(u8, content, header) orelse { 
std.debug.print("docs-check: CHANGELOG.md: missing '## [Unreleased]' section\n", .{}); - ok.* = false; - return; + return false; }; const after = content[start + header.len ..]; @@ -2561,32 +2674,37 @@ fn checkChangelogCompatibilityLabels(content: []const u8, ok: *bool) void { "Migration:", "Downstream scope:", }; + var ok = true; for (required) |needle| { if (std.mem.indexOf(u8, section, needle) == null) { std.debug.print("docs-check: CHANGELOG.md: Unreleased section missing compatibility label '{s}'\n", .{needle}); - ok.* = false; + ok = false; } } + return ok; } -fn runDocsCheck(alloc: std.mem.Allocator) !void { - const markdown_files = try collectMarkdownFiles(alloc); - defer alloc.free(markdown_files); - var step_set = try loadBuildStepSet(alloc); - defer step_set.deinit(); +fn runDocsCheck(io: std.Io, alloc: std.mem.Allocator) !void { + const markdown_files = try collectMarkdownFiles(io, alloc); + defer { + for (markdown_files) |path| alloc.free(path); + alloc.free(markdown_files); + } + var step_set = try loadBuildStepSet(io, alloc); + defer deinitOwnedStringSet(alloc, &step_set); var ok = true; var checked: usize = 0; for (markdown_files) |md_path| { - const content = try common.readFileAlloc(alloc, md_path); + const content = try common.readFileAlloc(io, alloc, md_path); defer alloc.free(content); checked += 1; - checkLocalAbsolutePaths(md_path, content, &ok); - try checkMarkdownLinks(alloc, md_path, content, &ok); - checkDocumentedBuildCommands(md_path, content, &step_set, &ok); + ok = checkLocalAbsolutePaths(md_path, content) and ok; + ok = (try checkMarkdownLinks(io, alloc, md_path, content)) and ok; + ok = checkDocumentedBuildCommands(md_path, content, step_set) and ok; if (std.mem.eql(u8, md_path, "CHANGELOG.md")) { - checkChangelogCompatibilityLabels(content, &ok); + ok = checkChangelogCompatibilityLabels(content) and ok; } } @@ -2594,14 +2712,18 @@ fn runDocsCheck(alloc: std.mem.Allocator) !void { std.debug.print("docs-check: OK ({d} markdown 
files)\n", .{checked}); } -fn runExamplesCheck(alloc: std.mem.Allocator) !void { - const example_files = try collectExampleFiles(alloc); - defer alloc.free(example_files); +fn runExamplesCheck(io: std.Io, alloc: std.mem.Allocator) !void { + const example_files = try collectExampleFiles(io, alloc); + defer { + for (example_files) |path| alloc.free(path); + alloc.free(example_files); + } if (example_files.len == 0) return error.NoExamplesFound; for (example_files) |example_path| { std.debug.print("examples-check: zig test {s}\n", .{example_path}); const root_mod = try std.fmt.allocPrint(alloc, "-Mroot={s}", .{example_path}); + defer alloc.free(root_mod); const html_mod = "-Mhtmlparser=src/root.zig"; const argv = [_][]const u8{ "zig", @@ -2611,7 +2733,7 @@ fn runExamplesCheck(alloc: std.mem.Allocator) !void { root_mod, html_mod, }; - try common.runInherit(alloc, &argv, REPO_ROOT); + try common.runInherit(io, alloc, &argv, REPO_ROOT); } std.debug.print("examples-check: OK ({d} examples)\n", .{example_files.len}); } @@ -2631,13 +2753,10 @@ fn usage() void { } /// CLI entrypoint for repository maintenance, benchmarking, and conformance tasks. 
-pub fn main() !void { - var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); - defer arena.deinit(); - const alloc = arena.allocator(); - - const args = try std.process.argsAlloc(alloc); - defer std.process.argsFree(alloc, args); +pub fn main(init: std.process.Init) !void { + const io = init.io; + const alloc = init.gpa; + const args = try init.minimal.args.toSlice(init.arena.allocator()); if (args.len < 2) { usage(); return; @@ -2646,7 +2765,7 @@ pub fn main() !void { const rest = args[2..]; if (std.mem.eql(u8, cmd, "setup-parsers")) { - try setupParsers(alloc); + try setupParsers(io, alloc); return; } if (std.mem.eql(u8, cmd, "setup-fixtures")) { @@ -2656,32 +2775,160 @@ pub fn main() !void { refresh = true; } else return error.InvalidArgument; } - try setupFixtures(alloc, refresh); + try setupFixtures(io, alloc, refresh); return; } if (std.mem.eql(u8, cmd, "run-benchmarks")) { - try runBenchmarks(alloc, rest); + try runBenchmarks(io, alloc, rest); return; } if (std.mem.eql(u8, cmd, "sync-docs-bench")) { if (rest.len != 0) return error.InvalidArgument; - try updateDocumentationBenchmarkSnapshot(alloc); - try updateReadmeAutoSummary(alloc); + try updateDocumentationBenchmarkSnapshot(io, alloc); + try updateReadmeAutoSummary(io, alloc); return; } if (std.mem.eql(u8, cmd, "run-external-suites")) { - try runExternalSuites(alloc, rest); + try runExternalSuites(io, alloc, rest); return; } if (std.mem.eql(u8, cmd, "docs-check")) { - try runDocsCheck(alloc); + try runDocsCheck(io, alloc); return; } if (std.mem.eql(u8, cmd, "examples-check")) { - try runExamplesCheck(alloc); + try runExamplesCheck(io, alloc); return; } usage(); return error.InvalidCommand; } + +test "bench cleanup frees sample buffers" { + const alloc = std.testing.allocator; + + var empty_parse: [0]ParseResult = .{}; + freeParseSamples(alloc, &empty_parse); + + const parse_samples = try alloc.alloc(u64, 2); + parse_samples[0] = 1; + parse_samples[1] = 2; + var parse_rows = 
[_]ParseResult{.{ + .parser = "ours", + .fixture = "fixture.html", + .iterations = 1, + .samples_ns = parse_samples, + .median_ns = 1, + .throughput_mb_s = 1.0, + }}; + freeParseSamples(alloc, &parse_rows); + + const query_samples = try alloc.alloc(u64, 1); + query_samples[0] = 3; + var query_rows = [_]QueryResult{.{ + .parser = "ours", + .case = "case", + .selector = "div", + .fixture = null, + .iterations = 1, + .samples_ns = query_samples, + .median_ns = 1, + .ops_s = 1.0, + .ns_per_op = 1.0, + }}; + freeQuerySamples(alloc, &query_rows); + + var empty_query: [0]QueryResult = .{}; + freeQuerySamples(alloc, &empty_query); +} + +test "owned string list cleanup frees entries" { + const alloc = std.testing.allocator; + var empty = std.ArrayList([]const u8).empty; + deinitOwnedStringList(alloc, &empty); + var list = std.ArrayList([]const u8).empty; + const one = try alloc.dupe(u8, "one"); + try appendOwnedString(alloc, &list, one); + const two = try alloc.dupe(u8, "two"); + try appendOwnedString(alloc, &list, two); + deinitOwnedStringList(alloc, &list); +} + +test "owned string set cleanup frees keys" { + const alloc = std.testing.allocator; + var empty = std.StringHashMap(void).init(alloc); + deinitOwnedStringSet(alloc, &empty); + + var set = std.StringHashMap(void).init(alloc); + const docs_check = try alloc.dupe(u8, "docs-check"); + try putOwnedString(alloc, &set, docs_check); + const examples_check = try alloc.dupe(u8, "examples-check"); + try putOwnedString(alloc, &set, examples_check); + deinitOwnedStringSet(alloc, &set); +} + +test "parser case transfer frees nested ownership on append failure" { + try std.testing.checkAllAllocationFailures(std.testing.allocator, struct { + fn makeOneCase(alloc: std.mem.Allocator) ![]ParserCase { + const html = try alloc.dupe(u8, ""); + errdefer alloc.free(html); + + var expected = std.ArrayList([]const u8).empty; + errdefer deinitOwnedStringList(alloc, &expected); + const div = try alloc.dupe(u8, "div"); + try 
appendOwnedString(alloc, &expected, div); + + const expected_slice = try expected.toOwnedSlice(alloc); + errdefer freeOwnedStringSlice(alloc, expected_slice); + + const cases = try alloc.alloc(ParserCase, 1); + errdefer alloc.free(cases); + cases[0] = .{ + .html = html, + .expected = expected_slice, + }; + return cases; + } + + fn run(alloc: std.mem.Allocator) !void { + var dst = std.ArrayList(ParserCase).empty; + defer deinitParserCaseList(alloc, &dst); + + const cases = try makeOneCase(alloc); + try transferOwnedParserCases(alloc, &dst, cases); + } + }.run, .{}); +} + +test "html5lib dat parser frees nested allocations on allocator failure" { + const sample = + "#data\n" ++ + "\n" ++ + "#document\n" ++ + "| \n" ++ + "| \n" ++ + "| \n" ++ + "| \n"; + + try std.testing.checkAllAllocationFailures(std.testing.allocator, struct { + fn run(alloc: std.mem.Allocator, text: []const u8) !void { + const cases = try parseHtml5libDatText(alloc, text); + defer freeParserCases(alloc, cases); + } + }.run, .{sample}); +} + +test "wpt html suite parser frees nested allocations on allocator failure" { + const sample = + "var tests = {};\n" ++ + "init_tests();\n" ++ + "[async_test('case'), \"%3Cdiv%3E%3C/div%3E\", \"| \\n| \\n| \\n|\"]\n"; + + try std.testing.checkAllAllocationFailures(std.testing.allocator, struct { + fn run(alloc: std.mem.Allocator, text: []const u8) !void { + const cases = try parseWptHtmlSuiteText(alloc, text); + defer freeParserCases(alloc, cases); + } + }.run, .{sample}); +} diff --git a/tools/suite_runner.zig b/tools/suite_runner.zig index 61ec62d..525b196 100644 --- a/tools/suite_runner.zig +++ b/tools/suite_runner.zig @@ -2,27 +2,34 @@ const std = @import("std"); const html = @import("htmlparser"); const default_options: html.ParseOptions = .{}; const Document = default_options.GetDocument(); +const parse_mode = @import("parse_mode.zig"); +const ParseMode = parse_mode.ParseMode; -const ParseMode = enum { - strictest, - fastest, +const ParsedFixture = struct 
{ + doc: Document, + working: []u8, + + fn deinit(self: *ParsedFixture, alloc: std.mem.Allocator) void { + self.doc.deinit(); + alloc.free(self.working); + } }; -fn parseMode(s: []const u8) ?ParseMode { - if (std.mem.eql(u8, s, "strictest")) return .strictest; - if (std.mem.eql(u8, s, "fastest")) return .fastest; - return null; -} +fn parseFixtureDoc(io: std.Io, alloc: std.mem.Allocator, mode: ParseMode, fixture_path: []const u8) !ParsedFixture { + const input = try std.Io.Dir.cwd().readFileAlloc(io, fixture_path, alloc, .unlimited); + defer alloc.free(input); + + const working = try alloc.dupe(u8, input); + errdefer alloc.free(working); + + var doc = Document.init(alloc); + errdefer doc.deinit(); -fn parseDoc(noalias doc: *Document, input: []u8, mode: ParseMode) !void { switch (mode) { - .strictest => try doc.parse(input, .{ - .drop_whitespace_text_nodes = false, - }), - .fastest => try doc.parse(input, .{ - .drop_whitespace_text_nodes = true, - }), + .strictest => try doc.parse(working, .{ .drop_whitespace_text_nodes = false }), + .fastest => try doc.parse(working, .{ .drop_whitespace_text_nodes = true }), } + return .{ .doc = doc, .working = working }; } fn jsonEscape(writer: anytype, s: []const u8) !void { @@ -55,106 +62,78 @@ fn printJsonStringArray(writer: anytype, items: []const []const u8) !void { try writer.writeByte(']'); } -fn runSelectorIds(alloc: std.mem.Allocator, mode: ParseMode, fixture_path: []const u8, selector: []const u8) !void { - const input = try std.fs.cwd().readFileAlloc(alloc, fixture_path, std.math.maxInt(usize)); - defer alloc.free(input); - - const working = try alloc.dupe(u8, input); - defer alloc.free(working); - - var doc = Document.init(alloc); - defer doc.deinit(); - try parseDoc(&doc, working, mode); +fn runSelectorIds(io: std.Io, alloc: std.mem.Allocator, mode: ParseMode, fixture_path: []const u8, selector: []const u8) !void { + var parsed = try parseFixtureDoc(io, alloc, mode, fixture_path); + defer parsed.deinit(alloc); var 
out_ids = std.ArrayList([]const u8).empty; defer out_ids.deinit(alloc); - var it = try doc.queryAllRuntime(selector); + var it = try parsed.doc.queryAllRuntime(selector); while (it.next()) |node| { if (node.getAttributeValue("id")) |id| { try out_ids.append(alloc, id); } } - var out_buf = std.ArrayList(u8).empty; - defer out_buf.deinit(alloc); - try printJsonStringArray(out_buf.writer(alloc), out_ids.items); - try out_buf.append(alloc, '\n'); - try std.fs.File.stdout().writeAll(out_buf.items); + var out_buf: std.Io.Writer.Allocating = .init(alloc); + defer out_buf.deinit(); + try printJsonStringArray(&out_buf.writer, out_ids.items); + try out_buf.writer.writeByte('\n'); + try std.Io.File.stdout().writeStreamingAll(io, out_buf.written()); } -fn runSelectorCount(alloc: std.mem.Allocator, mode: ParseMode, fixture_path: []const u8, selector: []const u8) !void { - const input = try std.fs.cwd().readFileAlloc(alloc, fixture_path, std.math.maxInt(usize)); - defer alloc.free(input); - - const working = try alloc.dupe(u8, input); - defer alloc.free(working); - - var doc = Document.init(alloc); - defer doc.deinit(); - try parseDoc(&doc, working, mode); +fn runSelectorCount(io: std.Io, alloc: std.mem.Allocator, mode: ParseMode, fixture_path: []const u8, selector: []const u8) !void { + var parsed = try parseFixtureDoc(io, alloc, mode, fixture_path); + defer parsed.deinit(alloc); var count: usize = 0; - var it = try doc.queryAllRuntime(selector); + var it = try parsed.doc.queryAllRuntime(selector); while (it.next()) |_| { count += 1; } - var out_buf = std.ArrayList(u8).empty; - defer out_buf.deinit(alloc); - try out_buf.writer(alloc).print("{d}\n", .{count}); - try std.fs.File.stdout().writeAll(out_buf.items); + var out_buf: std.Io.Writer.Allocating = .init(alloc); + defer out_buf.deinit(); + try out_buf.writer.print("{d}\n", .{count}); + try std.Io.File.stdout().writeStreamingAll(io, out_buf.written()); } -fn runSelectorCountScopeTag(alloc: std.mem.Allocator, mode: ParseMode, 
fixture_path: []const u8, scope_tag: []const u8, selector: []const u8) !void { - const input = try std.fs.cwd().readFileAlloc(alloc, fixture_path, std.math.maxInt(usize)); - defer alloc.free(input); - - const working = try alloc.dupe(u8, input); - defer alloc.free(working); - - var doc = Document.init(alloc); - defer doc.deinit(); - try parseDoc(&doc, working, mode); +fn runSelectorCountScopeTag(io: std.Io, alloc: std.mem.Allocator, mode: ParseMode, fixture_path: []const u8, scope_tag: []const u8, selector: []const u8) !void { + var parsed = try parseFixtureDoc(io, alloc, mode, fixture_path); + defer parsed.deinit(alloc); var count: usize = 0; - if (doc.findFirstTag(scope_tag)) |scope| { + if (parsed.doc.findFirstTag(scope_tag)) |scope| { var it = try scope.queryAllRuntime(selector); while (it.next()) |_| { count += 1; } } - var out_buf = std.ArrayList(u8).empty; - defer out_buf.deinit(alloc); - try out_buf.writer(alloc).print("{d}\n", .{count}); - try std.fs.File.stdout().writeAll(out_buf.items); + var out_buf: std.Io.Writer.Allocating = .init(alloc); + defer out_buf.deinit(); + try out_buf.writer.print("{d}\n", .{count}); + try std.Io.File.stdout().writeStreamingAll(io, out_buf.written()); } -fn runParseTagsFile(alloc: std.mem.Allocator, mode: ParseMode, fixture_path: []const u8) !void { - const input = try std.fs.cwd().readFileAlloc(alloc, fixture_path, std.math.maxInt(usize)); - defer alloc.free(input); - - const working = try alloc.dupe(u8, input); - defer alloc.free(working); - - var doc = Document.init(alloc); - defer doc.deinit(); - try parseDoc(&doc, working, mode); +fn runParseTagsFile(io: std.Io, alloc: std.mem.Allocator, mode: ParseMode, fixture_path: []const u8) !void { + var parsed = try parseFixtureDoc(io, alloc, mode, fixture_path); + defer parsed.deinit(alloc); var tags = std.ArrayList([]const u8).empty; defer tags.deinit(alloc); - for (doc.nodes.items) |*n| { + for (parsed.doc.nodes.items) |*n| { if (n.kind != .element) continue; - try 
tags.append(alloc, n.name_or_text.slice(doc.source)); + try tags.append(alloc, n.name_or_text.slice(parsed.doc.source)); } - var out_buf = std.ArrayList(u8).empty; - defer out_buf.deinit(alloc); - try printJsonStringArray(out_buf.writer(alloc), tags.items); - try out_buf.append(alloc, '\n'); - try std.fs.File.stdout().writeAll(out_buf.items); + var out_buf: std.Io.Writer.Allocating = .init(alloc); + defer out_buf.deinit(); + try printJsonStringArray(&out_buf.writer, tags.items); + try out_buf.writer.writeByte('\n'); + try std.Io.File.stdout().writeStreamingAll(io, out_buf.written()); } fn usage() noreturn { @@ -166,41 +145,38 @@ fn usage() noreturn { } /// CLI entrypoint used by external-suite tooling to execute selector/parser probes. -pub fn main() !void { - var gpa = std.heap.GeneralPurposeAllocator(.{}){}; - defer _ = gpa.deinit(); - const alloc = gpa.allocator(); - - const args = try std.process.argsAlloc(alloc); - defer std.process.argsFree(alloc, args); +pub fn main(init: std.process.Init) !void { + const alloc = init.gpa; + const io = init.io; + const args = try init.minimal.args.toSlice(init.arena.allocator()); if (args.len < 2) usage(); if (std.mem.eql(u8, args[1], "selector-ids")) { if (args.len != 5) usage(); - const mode = parseMode(args[2]) orelse usage(); - try runSelectorIds(alloc, mode, args[3], args[4]); + const mode = parse_mode.parseMode(args[2]) orelse usage(); + try runSelectorIds(io, alloc, mode, args[3], args[4]); return; } if (std.mem.eql(u8, args[1], "selector-count")) { if (args.len != 5) usage(); - const mode = parseMode(args[2]) orelse usage(); - try runSelectorCount(alloc, mode, args[3], args[4]); + const mode = parse_mode.parseMode(args[2]) orelse usage(); + try runSelectorCount(io, alloc, mode, args[3], args[4]); return; } if (std.mem.eql(u8, args[1], "selector-count-scope-tag")) { if (args.len != 6) usage(); - const mode = parseMode(args[2]) orelse usage(); - try runSelectorCountScopeTag(alloc, mode, args[3], args[4], args[5]); + 
const mode = parse_mode.parseMode(args[2]) orelse usage(); + try runSelectorCountScopeTag(io, alloc, mode, args[3], args[4], args[5]); return; } if (std.mem.eql(u8, args[1], "parse-tags-file")) { if (args.len != 4) usage(); - const mode = parseMode(args[2]) orelse usage(); - try runParseTagsFile(alloc, mode, args[3]); + const mode = parse_mode.parseMode(args[2]) orelse usage(); + try runParseTagsFile(io, alloc, mode, args[3]); return; } diff --git a/tools/test_runner.zig b/tools/test_runner.zig new file mode 100644 index 0000000..6160221 --- /dev/null +++ b/tools/test_runner.zig @@ -0,0 +1,419 @@ +const std = @import("std"); +const builtin = @import("builtin"); + +pub const panic = std.debug.FullPanic(panicHandler); + +var is_child_mode: bool = false; + +pub fn fuzz( + context: anytype, + comptime testOne: fn (context: @TypeOf(context), smith: *std.testing.Smith) anyerror!void, + fuzz_opts: std.testing.FuzzInputOptions, +) anyerror!void { + if (comptime builtin.fuzz) { + return fuzzBuiltin(context, testOne, fuzz_opts); + } + + if (fuzz_opts.corpus.len == 0) { + var smith: std.testing.Smith = .{ .in = "" }; + return testOne(context, &smith); + } + + for (fuzz_opts.corpus) |input| { + var smith: std.testing.Smith = .{ .in = input }; + try testOne(context, &smith); + } +} + +fn fuzzBuiltin( + context: anytype, + comptime testOne: fn (context: @TypeOf(context), smith: *std.testing.Smith) anyerror!void, + fuzz_opts: std.testing.FuzzInputOptions, +) anyerror!void { + const fuzz_abi = std.Build.abi.fuzz; + const Smith = std.testing.Smith; + const Ctx = @TypeOf(context); + + const Wrapper = struct { + var ctx: Ctx = undefined; + pub fn testOneC() callconv(.c) void { + var smith: Smith = .{ .in = null }; + testOne(ctx, &smith) catch {}; + } + }; + + Wrapper.ctx = context; + + var cache_dir: []const u8 = "."; + var map_opt: ?std.process.Environ.Map = null; + if (std.testing.environ.createMap(std.testing.allocator)) |map| { + map_opt = map; + if (map.get("ZIG_CACHE_DIR")) |v| 
{ + cache_dir = v; + } else if (map.get("ZIG_GLOBAL_CACHE_DIR")) |v| { + cache_dir = v; + } + } else |_| {} + + fuzz_abi.fuzzer_init(.fromSlice(cache_dir)); + + const test_name = @typeName(@TypeOf(testOne)); + fuzz_abi.fuzzer_set_test(Wrapper.testOneC, .fromSlice(test_name)); + + for (fuzz_opts.corpus) |input| { + fuzz_abi.fuzzer_new_input(.fromSlice(input)); + } + + fuzz_abi.fuzzer_main(.forever, 0); + + if (map_opt) |*m| m.deinit(); +} + +pub fn main(init: std.process.Init) !void { + const threaded = std.Io.Threaded.init(init.gpa, .{ + .argv0 = .init(init.minimal.args), + .environ = init.minimal.environ, + }); + std.testing.io_instance = threaded; + defer std.testing.io_instance.deinit(); + std.testing.environ = init.minimal.environ; + + var arg_it = try std.process.Args.Iterator.initAllocator(init.minimal.args, init.gpa); + defer arg_it.deinit(); + + const argv0_z = arg_it.next() orelse "test-runner"; + const argv0 = try init.gpa.dupe(u8, argv0_z[0..argv0_z.len]); + defer init.gpa.free(argv0); + + var child_test_name: ?[]const u8 = null; + var filter: ?[]const u8 = null; + var jobs: ?usize = null; + var seed: ?u32 = null; + + while (arg_it.next()) |arg_z| { + const arg = arg_z[0..arg_z.len]; + if (std.mem.eql(u8, arg, "--zhttp-run-test")) { + const name_z = arg_it.next() orelse return error.MissingTestName; + child_test_name = try init.gpa.dupe(u8, name_z[0..name_z.len]); + } else if (std.mem.eql(u8, arg, "--test-filter")) { + const f_z = arg_it.next() orelse return error.MissingFilter; + filter = try init.gpa.dupe(u8, f_z[0..f_z.len]); + } else if (std.mem.eql(u8, arg, "--jobs")) { + const j_z = arg_it.next() orelse return error.MissingJobs; + jobs = try parseUsize(j_z[0..j_z.len]); + } else if (std.mem.eql(u8, arg, "--seed")) { + const s_z = arg_it.next() orelse return error.MissingSeed; + seed = try parseU32(s_z[0..s_z.len]); + } else if (std.mem.eql(u8, arg, "--help")) { + printHelp(); + return; + } else { + // Ignore unknown args to stay compatible with 
Zig's test flags. + } + } + + if (child_test_name) |name| { + is_child_mode = true; + defer init.gpa.free(name); + runSingleTest(name, seed); + return; + } + + // Release the filter only when main returns: a defer nested inside an + // `if` block fires at that block's end, before runAllTests reads filter. + defer if (filter) |f| init.gpa.free(f); + try runAllTests(init.gpa, init.io, argv0, filter, jobs, seed); +} + +fn panicHandler(msg: []const u8, first_trace_addr: ?usize) noreturn { + if (is_child_mode) { + std.debug.print("{s}\n", .{msg}); + std.process.exit(1); + } + std.debug.defaultPanic(msg, first_trace_addr); +} + +fn parseUsize(s: []const u8) !usize { + return std.fmt.parseUnsigned(usize, s, 10); +} + +fn parseU32(s: []const u8) !u32 { + return std.fmt.parseUnsigned(u32, s, 10); +} + +fn printHelp() void { + std.debug.print( + "Usage: test-runner [--test-filter] [--jobs ] [--seed ]\n", + .{}, + ); +} + +const Status = enum { + pass, + fail, + skip, + leak, + crash, +}; + +const Summary = struct { + pass: usize = 0, + fail: usize = 0, + skip: usize = 0, + leak: usize = 0, + crash: usize = 0, +}; + +fn runAllTests( + gpa: std.mem.Allocator, + io: std.Io, + argv0: []const u8, + filter: ?[]const u8, + jobs: ?usize, + seed: ?u32, +) !void { + var tests: std.ArrayList([]const u8) = .empty; + defer tests.deinit(gpa); + + for (builtin.test_functions) |t| { + if (filter) |f| { + if (std.mem.indexOf(u8, t.name, f) == null) continue; + } + try tests.append(gpa, t.name); + } + + if (tests.items.len == 0) { + std.debug.print("0 tests selected\n", .{}); + return; + } + + const cpu_count = std.Thread.getCpuCount() catch 1; + var job_count = jobs orelse cpu_count; + if (job_count == 0) job_count = 1; + if (job_count > tests.items.len) job_count = tests.items.len; + + var next_index: std.atomic.Value(usize) = .init(0); + var summary: Summary = .{}; + var print_mutex: std.Io.Mutex = .init; + var count_mutex: std.Io.Mutex = .init; + + var ctx = WorkerCtx{ + .gpa = gpa, + .io = io, + .argv0 = argv0, + .tests = tests.items, + .seed = seed, + .next_index = &next_index, + .summary = &summary, + .print_mutex = 
&print_mutex, + .count_mutex = &count_mutex, + }; + + if (builtin.single_threaded or job_count == 1) { + worker(&ctx); + } else { + const threads = try gpa.alloc(std.Thread, job_count); + defer gpa.free(threads); + for (threads, 0..) |*t, i| { + _ = i; + t.* = try std.Thread.spawn(.{}, worker, .{&ctx}); + } + for (threads) |t| t.join(); + } + + std.debug.print( + "\npass: {d} fail: {d} skip: {d} leak: {d} crash: {d}\n", + .{ summary.pass, summary.fail, summary.skip, summary.leak, summary.crash }, + ); + + if (summary.fail != 0 or summary.crash != 0 or summary.leak != 0) { + std.process.exit(1); + } +} + +const WorkerCtx = struct { + gpa: std.mem.Allocator, + io: std.Io, + argv0: []const u8, + tests: []const []const u8, + seed: ?u32, + next_index: *std.atomic.Value(usize), + summary: *Summary, + print_mutex: *std.Io.Mutex, + count_mutex: *std.Io.Mutex, +}; + +fn worker(ctx: *WorkerCtx) void { + while (true) { + const idx = ctx.next_index.fetchAdd(1, .seq_cst); + if (idx >= ctx.tests.len) break; + + const test_name = ctx.tests[idx]; + const result = runChildTest(ctx, test_name) catch |err| { + ctx.print_mutex.lockUncancelable(ctx.io); + defer ctx.print_mutex.unlock(ctx.io); + std.debug.print("\n== TEST {s} ==\nrunner error: {s}\n", .{ test_name, @errorName(err) }); + ctx.count_mutex.lockUncancelable(ctx.io); + ctx.summary.fail += 1; + ctx.count_mutex.unlock(ctx.io); + continue; + }; + defer ctx.gpa.free(result.stdout); + defer ctx.gpa.free(result.stderr); + + ctx.print_mutex.lockUncancelable(ctx.io); + defer ctx.print_mutex.unlock(ctx.io); + printTestOutput(test_name, result); + + ctx.count_mutex.lockUncancelable(ctx.io); + switch (result.status) { + .pass => ctx.summary.pass += 1, + .fail => ctx.summary.fail += 1, + .skip => ctx.summary.skip += 1, + .leak => ctx.summary.leak += 1, + .crash => ctx.summary.crash += 1, + } + ctx.count_mutex.unlock(ctx.io); + } +} + +const ChildResult = struct { + status: Status, + term: std.process.Child.Term, + stdout: []u8, + stderr: 
[]u8, +}; + +fn runChildTest(ctx: *WorkerCtx, test_name: []const u8) !ChildResult { + var argv: std.ArrayList([]const u8) = .empty; + defer argv.deinit(ctx.gpa); + + try argv.append(ctx.gpa, ctx.argv0); + try argv.append(ctx.gpa, "--zhttp-run-test"); + try argv.append(ctx.gpa, test_name); + + var seed_buf: ?[]u8 = null; + if (ctx.seed) |s| { + const seed_str = try std.fmt.allocPrint(ctx.gpa, "{d}", .{s}); + seed_buf = seed_str; + try argv.append(ctx.gpa, "--seed"); + try argv.append(ctx.gpa, seed_str); + } + defer if (seed_buf) |b| ctx.gpa.free(b); + + const result = try std.process.run(ctx.gpa, ctx.io, .{ + .argv = argv.items, + .stdout_limit = .limited(4 * 1024 * 1024), + .stderr_limit = .limited(4 * 1024 * 1024), + }); + + const status = classifyStatus(result.term); + return .{ + .status = status, + .term = result.term, + .stdout = result.stdout, + .stderr = result.stderr, + }; +} + +fn classifyStatus(term: std.process.Child.Term) Status { + switch (term) { + .exited => |code| return switch (code) { + 0 => .pass, + 2 => .skip, + 3 => .leak, + else => .fail, + }, + .signal, .stopped, .unknown => return .crash, + } +} + +fn printTestOutput(name: []const u8, res: ChildResult) void { + const color = switch (res.status) { + .pass => "\x1b[32m", + .skip => "\x1b[94m", + else => "\x1b[31m", + }; + const label = switch (res.status) { + .pass => "ok", + .skip => "skip", + .leak => "leak", + .crash => "crash", + .fail => "error", + }; + + std.debug.print("{s}{s}\x1b[0m {s}", .{ color, label, name }); + + if (res.stdout.len > 0) { + std.debug.print(" | out: ", .{}); + printSingleLine(res.stdout, 200); + } + if (res.stderr.len > 0) { + std.debug.print(" | err: ", .{}); + printSingleLine(res.stderr, 200); + } + + switch (res.term) { + .exited => |code| if (code != 0) std.debug.print(" | exit {d}", .{code}), + .signal => |sig| std.debug.print(" | signal {d}", .{@intFromEnum(sig)}), + .stopped => |code| std.debug.print(" | stopped {d}", .{code}), + .unknown => |code| 
std.debug.print(" | unknown {d}", .{code}), + } + + std.debug.print("\n", .{}); +} + +fn printSingleLine(bytes: []const u8, max_len: usize) void { + var written: usize = 0; + for (bytes) |c| { + if (written >= max_len) break; + switch (c) { + '\n', '\r', '\t' => { + std.debug.print(" ", .{}); + written += 1; + }, + else => { + std.debug.print("{c}", .{c}); + written += 1; + }, + } + } + if (bytes.len > max_len) std.debug.print("...", .{}); +} + +fn runSingleTest(name: []const u8, seed: ?u32) void { + if (seed) |s| std.testing.random_seed = s; + + const test_fn = findTest(name) orelse { + std.debug.print("unknown test: {s}\n", .{name}); + std.process.exit(1); + }; + + std.testing.allocator_instance = .{}; + const result = test_fn.func(); + const leak_status = std.testing.allocator_instance.deinit(); + + if (leak_status == .leak) { + std.debug.print("memory leak\n", .{}); + std.process.exit(3); + } + + if (result) |_| { + std.process.exit(0); + } else |err| switch (err) { + error.SkipZigTest => std.process.exit(2), + else => { + std.debug.print("{s}\n", .{@errorName(err)}); + std.process.exit(1); + }, + } +} + +const TestFn = std.meta.Elem(@TypeOf(builtin.test_functions)); + +fn findTest(name: []const u8) ?TestFn { + for (builtin.test_functions) |t| { + if (std.mem.eql(u8, t.name, name)) return t; + } + return null; +} diff --git a/tools/tests/examples_tests.zig b/tools/tests/examples_tests.zig index 37f1993..30196ed 100644 --- a/tools/tests/examples_tests.zig +++ b/tools/tests/examples_tests.zig @@ -1,181 +1,37 @@ -const std = @import("std"); -const html = @import("htmlparser"); -const default_options: html.ParseOptions = .{}; -const Document = default_options.GetDocument(); +const examples = @import("examples"); test "example parity: basic parse and query" { - var doc = Document.init(std.testing.allocator); - defer doc.deinit(); - - var input = "".*; - try doc.parse(&input, .{}); - - const a = doc.queryOne("div#app > a.nav") orelse return 
error.TestUnexpectedResult; - try std.testing.expectEqualStrings("/docs", a.getAttributeValue("href").?); + try examples.basic_parse_query.run(); } test "example parity: runtime selectors" { - var doc = Document.init(std.testing.allocator); - defer doc.deinit(); - - var input = "".*; - try doc.parse(&input, .{}); - - try std.testing.expect((try doc.queryOneRuntime("a.primary")) != null); - - var it = try doc.queryAllRuntime("a[href]"); - try std.testing.expect(it.next() != null); - try std.testing.expect(it.next() != null); - try std.testing.expect(it.next() == null); + try examples.runtime_selector.run(); } test "example parity: cached selector" { - var doc = Document.init(std.testing.allocator); - defer doc.deinit(); - - const input = - ""; - - var buf = input.*; - try doc.parse(&buf, .{}); - - var arena = std.heap.ArenaAllocator.init(std.testing.allocator); - defer arena.deinit(); - - const sel = try html.Selector.compileRuntime(arena.allocator(), "a[href^=https][class~=button]"); - const first = doc.queryOneCached(&sel) orelse return error.TestUnexpectedResult; - try std.testing.expectEqualStrings("a1", first.getAttributeValue("id").?); + try examples.cached_selector.run(); } test "example parity: navigation and children" { - var doc = Document.init(std.testing.allocator); - defer doc.deinit(); - - var input = " ".*; - try doc.parse(&input, .{}); - - const main = doc.queryOne("main#m") orelse return error.TestUnexpectedResult; - const first = main.firstChild() orelse return error.TestUnexpectedResult; - const last = main.lastChild() orelse return error.TestUnexpectedResult; - - try std.testing.expectEqualStrings("title", first.getAttributeValue("id").?); - try std.testing.expectEqualStrings("body", last.getAttributeValue("id").?); - var children = main.children(); - var child_indexes: std.ArrayListUnmanaged(u32) = .{}; - defer child_indexes.deinit(std.testing.allocator); - try children.collect(std.testing.allocator, &child_indexes); - try 
std.testing.expectEqual(@as(usize, 3), child_indexes.items.len); - const first_via_index = main.doc.nodeAt(child_indexes.items[0]) orelse return error.TestUnexpectedResult; - try std.testing.expectEqualStrings("title", first_via_index.getAttributeValue("id").?); + try examples.navigation_and_children.run(); } test "example parity: innerText options" { - var doc = Document.init(std.testing.allocator); - defer doc.deinit(); - - var input = " Hello\n world &\tteam".*; - try doc.parse(&input, .{}); - - const node = doc.queryOne("div#x") orelse return error.TestUnexpectedResult; - - var arena_norm = std.heap.ArenaAllocator.init(std.testing.allocator); - defer arena_norm.deinit(); - const normalized = try node.innerText(arena_norm.allocator()); - try std.testing.expectEqualStrings("Hello world & team", normalized); - - var arena_raw = std.heap.ArenaAllocator.init(std.testing.allocator); - defer arena_raw.deinit(); - const raw = try node.innerTextWithOptions(arena_raw.allocator(), .{ .normalize_whitespace = false }); - try std.testing.expect(std.mem.indexOfScalar(u8, raw, '\n') != null); - - var arena_owned = std.heap.ArenaAllocator.init(std.testing.allocator); - defer arena_owned.deinit(); - const owned = try node.innerTextOwned(arena_owned.allocator()); - try std.testing.expectEqualStrings("Hello world & team", owned); - try std.testing.expect(!doc.isOwned(owned)); + try examples.inner_text_options.run(); } test "example parity: strictest and fastest selectors agree" { - const fixture = - "" ++ - "" ++ - ""; - - var strictest_doc = Document.init(std.testing.allocator); - defer strictest_doc.deinit(); - var strictest_buf = fixture.*; - try strictest_doc.parse(&strictest_buf, .{ - .drop_whitespace_text_nodes = false, - }); - - var fastest_doc = Document.init(std.testing.allocator); - defer fastest_doc.deinit(); - var fastest_buf = fixture.*; - try fastest_doc.parse(&fastest_buf, .{ - .drop_whitespace_text_nodes = true, - }); - - var strictest_it = 
strictest_doc.queryAll("li.item"); - var strictest_count: usize = 0; - while (strictest_it.next() != null) strictest_count += 1; - - var fastest_it = fastest_doc.queryAll("li.item"); - var fastest_count: usize = 0; - while (fastest_it.next() != null) fastest_count += 1; - - try std.testing.expectEqual(@as(usize, 2), strictest_count); - try std.testing.expectEqual(strictest_count, fastest_count); + try examples.strict_vs_fastest_parse.run(); } test "example parity: debug query report" { - var doc = Document.init(std.testing.allocator); - defer doc.deinit(); - - var input = "".*; - try doc.parse(&input, .{}); - - var report: html.QueryDebugReport = .{}; - const node = try doc.queryOneRuntimeDebug("a[href^=https]", &report); - try std.testing.expect(node == null); - try std.testing.expect(report.visited_elements > 0); - try std.testing.expect(report.near_miss_len > 0); - try std.testing.expect(report.near_misses[0].reason.kind != .none); + try examples.debug_query_report.run(); } test "example parity: instrumentation hooks" { - const Hooks = struct { - parse_start_calls: usize = 0, - parse_end_calls: usize = 0, - query_start_calls: usize = 0, - query_end_calls: usize = 0, - - pub fn onParseStart(self: *@This(), _: usize) void { - self.parse_start_calls += 1; - } - pub fn onParseEnd(self: *@This(), _: html.ParseInstrumentationStats) void { - self.parse_end_calls += 1; - } - pub fn onQueryStart(self: *@This(), _: html.QueryInstrumentationKind, _: usize) void { - self.query_start_calls += 1; - } - pub fn onQueryEnd(self: *@This(), _: html.QueryInstrumentationStats) void { - self.query_end_calls += 1; - } - }; - - var doc = Document.init(std.testing.allocator); - defer doc.deinit(); - var hooks: Hooks = .{}; - - var input = "
- A
- B
".*; - try html.parseWithHooks(&doc, &input, .{}, &hooks); - _ = try html.queryOneRuntimeWithHooks(&doc, "span#x", &hooks); + try examples.instrumentation_hooks.run(); +} - try std.testing.expectEqual(@as(usize, 1), hooks.parse_start_calls); - try std.testing.expectEqual(@as(usize, 1), hooks.parse_end_calls); - try std.testing.expectEqual(@as(usize, 1), hooks.query_start_calls); - try std.testing.expectEqual(@as(usize, 1), hooks.query_end_calls); +test "example parity: query-time decode" { + try examples.query_time_decode.run(); }