From e2136de9c2bad3271c7dd7079b024a0a0c4d9876 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Mon, 17 Nov 2025 18:20:19 -0300 Subject: [PATCH 01/55] Add parameter specifying the number of runs per query in `queries.cpp` --- tools/queries.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 59e7864d..711baf02 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -120,6 +120,7 @@ void op_perftest( spdlog::info("95% quantile: {}", q95); spdlog::info("99% quantile: {}", q99); spdlog::info("Num. reruns: {}", num_reruns); + spdlog::info("Num. of runs per query: {}", runs); stats_line()("type", index_type)("query", query_type)("avg", avg)("q50", q50)("q90", q90)( "q95", q95)("q99", q99); @@ -138,7 +139,8 @@ void perftest( const ScorerParams& scorer_params, const bool weighted, bool extract, - bool safe + bool safe, + std::size_t runs ) { auto const& index = *index_ptr; @@ -298,9 +300,9 @@ void perftest( break; } if (extract) { - extract_times(query_fun, queries, thresholds, type, t, 2, std::cout); + extract_times(query_fun, queries, thresholds, type, t, runs, std::cout); } else { - op_perftest(query_fun, queries, thresholds, type, t, 2, k, safe); + op_perftest(query_fun, queries, thresholds, type, t, runs, k, safe); } } } @@ -313,6 +315,7 @@ int main(int argc, const char** argv) { bool extract = false; bool safe = false; bool quantized = false; + std::size_t runs = 0; App, @@ -326,6 +329,9 @@ int main(int argc, const char** argv) { app.add_flag("--extract", extract, "Extract individual query times"); app.add_flag("--safe", safe, "Rerun if not enough results with pruning.") ->needs(app.thresholds_option()); + app.add_option("--runs", runs, "Number of runs per query") + ->default_val(2) + ->check(CLI::PositiveNumber); CLI11_PARSE(app, argc, argv); spdlog::set_default_logger(spdlog::stderr_color_mt("stderr")); @@ -348,7 +354,8 @@ int main(int argc, const char** 
argv) { app.scorer_params(), app.weighted(), extract, - safe + safe, + runs ); if (app.is_wand_compressed()) { if (quantized) { From 2d02ff4236fcffa8fac6450691acb089fa670030 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Mon, 17 Nov 2025 18:27:50 -0300 Subject: [PATCH 02/55] Rename `num_reruns` to `corrective_rerun_count` in `queries.cpp` --- tools/queries.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 711baf02..e09ec591 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -79,7 +79,7 @@ void op_perftest( bool safe ) { std::vector query_times; - std::size_t num_reruns = 0; + std::size_t corrective_rerun_count = 0; spdlog::info("Safe: {}", safe); for (size_t run = 0; run <= runs; ++run) { @@ -88,7 +88,7 @@ void op_perftest( auto usecs = run_with_timer([&]() { uint64_t result = query_func(query, thresholds[idx]); if (safe && result < k) { - num_reruns += 1; + corrective_rerun_count += 1; result = query_func(query, 0); } do_not_optimize_away(result); @@ -119,8 +119,8 @@ void op_perftest( spdlog::info("90% quantile: {}", q90); spdlog::info("95% quantile: {}", q95); spdlog::info("99% quantile: {}", q99); - spdlog::info("Num. reruns: {}", num_reruns); - spdlog::info("Num. 
of runs per query: {}", runs); + spdlog::info("Corrective reruns due to insufficient results: {}", corrective_rerun_count); + spdlog::info("Runs per query (excluding warmup): {}", runs); stats_line()("type", index_type)("query", query_type)("avg", avg)("q50", q50)("q90", q90)( "q95", q95)("q99", q99); From 6df7c3cf3ad9eeb3385205f427cb4808da043ef7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Tue, 18 Nov 2025 12:29:31 -0300 Subject: [PATCH 03/55] Report individual run times instead of mean in `--extract` mode --- tools/queries.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index e09ec591..d4e699cc 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -56,14 +56,17 @@ void extract_times( ) { std::vector times(runs); for (auto&& [qid, query]: enumerate(queries)) { - do_not_optimize_away(fn(query, thresholds[qid])); + do_not_optimize_away(fn(query, thresholds[qid])); // warmup std::generate(times.begin(), times.end(), [&fn, &q = query, &t = thresholds[qid]]() { return run_with_timer( [&]() { do_not_optimize_away(fn(q, t)); } ).count(); }); - auto mean = std::accumulate(times.begin(), times.end(), std::size_t{0}, std::plus<>()) / runs; - os << fmt::format("{}\t{}\n", query.id().value_or(std::to_string(qid)), mean); + os << fmt::format("{}", query.id().value_or(std::to_string(qid))); + for (auto t: times) { + os << fmt::format("\t{}", t); + } + os << "\n"; } } @@ -337,7 +340,11 @@ int main(int argc, const char** argv) { spdlog::set_default_logger(spdlog::stderr_color_mt("stderr")); spdlog::set_level(app.log_level()); if (extract) { - std::cout << "qid\tusec\n"; + std::cout << "qid"; + for (size_t i = 1; i <= runs; ++i) { + std::cout << fmt::format("\tusec{}", i); + } + std::cout<<"\n"; } run_for_index( From 28bb5b72f64333c74e7321f924a3ac83e04f4316 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Tue, 18 Nov 2025 14:47:49 -0300 
Subject: [PATCH 04/55] Add `--safe` flag support in `--extract` mode --- tools/queries.cpp | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index d4e699cc..69098534 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -46,28 +46,46 @@ using ranges::views::enumerate; template void extract_times( - Fn fn, + Fn query_func, std::vector const& queries, std::vector const& thresholds, std::string const& index_type, std::string const& query_type, size_t runs, + std::uint64_t k, + bool safe, std::ostream& os ) { - std::vector times(runs); + std::vector query_times(runs); + std::size_t corrective_rerun_count = 0; + spdlog::info("Safe: {}", safe); + for (auto&& [qid, query]: enumerate(queries)) { - do_not_optimize_away(fn(query, thresholds[qid])); // warmup - std::generate(times.begin(), times.end(), [&fn, &q = query, &t = thresholds[qid]]() { - return run_with_timer( - [&]() { do_not_optimize_away(fn(q, t)); } - ).count(); - }); + size_t idx = 0; + for (size_t run = 0; run <= runs; ++run) { + auto usecs = run_with_timer([&]() { + uint64_t result = query_func(query, thresholds[idx]); + if (safe && result < k) { + corrective_rerun_count += 1; + result = query_func(query, 0); + } + do_not_optimize_away(result); + }); + if (run != 0) { // first run is not timed + query_times.push_back(usecs.count()); + } + idx += 1; + } os << fmt::format("{}", query.id().value_or(std::to_string(qid))); - for (auto t: times) { + for (auto t: query_times) { os << fmt::format("\t{}", t); } os << "\n"; } + + spdlog::info("---- {} {}", index_type, query_type); + spdlog::info("Corrective reruns due to insufficient results: {}", corrective_rerun_count); + spdlog::info("Runs per query (excluding warmup): {}", runs); } template @@ -303,7 +321,7 @@ void perftest( break; } if (extract) { - extract_times(query_fun, queries, thresholds, type, t, runs, std::cout); + extract_times(query_fun, queries, 
thresholds, type, t, runs, k, safe, std::cout); } else { op_perftest(query_fun, queries, thresholds, type, t, runs, k, safe); } From 9137d12cef3ee43d6bce7f0bb53ed232d5ebc7c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Tue, 18 Nov 2025 15:24:44 -0300 Subject: [PATCH 05/55] Modify `--extract` mode to measure each query independently per run --- tools/queries.cpp | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 69098534..3d419b2e 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -56,15 +56,17 @@ void extract_times( bool safe, std::ostream& os ) { - std::vector query_times(runs); + std::vector> query_times(queries.size(), std::vector(runs)); std::size_t corrective_rerun_count = 0; spdlog::info("Safe: {}", safe); - for (auto&& [qid, query]: enumerate(queries)) { - size_t idx = 0; - for (size_t run = 0; run <= runs; ++run) { + // Note: each query is measured once per run, so the set of queries is + // measured independently in each run. + for (size_t run = 0; run <= runs; ++run) { + size_t query_idx = 0; + for (auto const& query: queries) { auto usecs = run_with_timer([&]() { - uint64_t result = query_func(query, thresholds[idx]); + uint64_t result = query_func(query, thresholds[query_idx]); if (safe && result < k) { corrective_rerun_count += 1; result = query_func(query, 0); @@ -72,12 +74,16 @@ void extract_times( do_not_optimize_away(result); }); if (run != 0) { // first run is not timed - query_times.push_back(usecs.count()); + query_times[query_idx][run - 1] = usecs.count(); } - idx += 1; + query_idx += 1; } - os << fmt::format("{}", query.id().value_or(std::to_string(qid))); - for (auto t: query_times) { + } + + // Print timing results for each query. 
+ for (auto&& [query_idx, query]: enumerate(queries)) { + os << fmt::format("{}", query.id().value_or(std::to_string(query_idx))); + for (auto t: query_times[query_idx]) { os << fmt::format("\t{}", t); } os << "\n"; From 972f0d64655cd78007cff00a5199d5865c31cf74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Tue, 18 Nov 2025 16:28:10 -0300 Subject: [PATCH 06/55] Modify `--extract` mode to print query stats without aggregation --- tools/queries.cpp | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tools/queries.cpp b/tools/queries.cpp index 3d419b2e..c89cb9ec 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -44,6 +44,41 @@ using namespace pisa; using ranges::views::enumerate; +enum class AggregationType { None = 0 }; + +[[nodiscard]] auto to_string(AggregationType type) -> std::string { + switch (type) { + case AggregationType::None: return "none"; + } + return "unknown"; +} + +std::vector aggregate_and_sort_query_times( + AggregationType aggregation_type, + std::vector> const& query_times +) { + std::vector aggregated_query_times; + if (aggregation_type == AggregationType::None) { + for (auto const& times: query_times) { + for (auto t: times) { + aggregated_query_times.push_back(t); + } + } + } + std::sort(aggregated_query_times.begin(), aggregated_query_times.end()); + return aggregated_query_times; +} + +void print_stats(AggregationType aggregation_type, std::vector const& query_times) { + double mean = std::accumulate(query_times.begin(), query_times.end(), 0.0) + / query_times.size(); + double q50 = query_times[query_times.size() / 2]; + double q90 = query_times[90 * query_times.size() / 100]; + double q95 = query_times[95 * query_times.size() / 100]; + double q99 = query_times[99 * query_times.size() / 100]; + stats_line()("aggregated_by", to_string(aggregation_type))("mean", mean)("q50", q50)("q90", q90)("q95", q95)("q99", q99); +} + template void extract_times( Fn query_func, @@ 
-92,6 +127,7 @@ void extract_times( spdlog::info("---- {} {}", index_type, query_type); spdlog::info("Corrective reruns due to insufficient results: {}", corrective_rerun_count); spdlog::info("Runs per query (excluding warmup): {}", runs); + print_stats(AggregationType::None, aggregate_and_sort_query_times(AggregationType::None, query_times)); } template From b1673f3c47e986f504b434cf68969c7d9ea713a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Tue, 18 Nov 2025 16:33:44 -0300 Subject: [PATCH 07/55] Modify `--extract` mode to print query stats aggregating by min --- tools/queries.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index c89cb9ec..6824298b 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -44,11 +44,12 @@ using namespace pisa; using ranges::views::enumerate; -enum class AggregationType { None = 0 }; +enum class AggregationType { None = 0, Min = 1, }; [[nodiscard]] auto to_string(AggregationType type) -> std::string { switch (type) { case AggregationType::None: return "none"; + case AggregationType::Min: return "min"; } return "unknown"; } @@ -64,13 +65,17 @@ std::vector aggregate_and_sort_query_times( aggregated_query_times.push_back(t); } } + } else if (aggregation_type == AggregationType::Min) { + for (auto const& times: query_times) { + aggregated_query_times.push_back(*std::min_element(times.begin(), times.end())); + } } std::sort(aggregated_query_times.begin(), aggregated_query_times.end()); return aggregated_query_times; } void print_stats(AggregationType aggregation_type, std::vector const& query_times) { - double mean = std::accumulate(query_times.begin(), query_times.end(), 0.0) + double mean = std::accumulate(query_times.begin(), query_times.end(), double()) / query_times.size(); double q50 = query_times[query_times.size() / 2]; double q90 = query_times[90 * query_times.size() / 100]; @@ -128,6 +133,7 @@ void extract_times( 
spdlog::info("Corrective reruns due to insufficient results: {}", corrective_rerun_count); spdlog::info("Runs per query (excluding warmup): {}", runs); print_stats(AggregationType::None, aggregate_and_sort_query_times(AggregationType::None, query_times)); + print_stats(AggregationType::Min, aggregate_and_sort_query_times(AggregationType::Min, query_times)); } template From 78baedf3f29cd2197b2ffd926a8d71dc9bb9adb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Tue, 18 Nov 2025 16:43:00 -0300 Subject: [PATCH 08/55] Modify `--extract` mode to print query stats aggregating by mean --- tools/queries.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 6824298b..ae686266 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -44,12 +44,13 @@ using namespace pisa; using ranges::views::enumerate; -enum class AggregationType { None = 0, Min = 1, }; +enum class AggregationType { None = 0, Min = 1, Mean = 2, }; [[nodiscard]] auto to_string(AggregationType type) -> std::string { switch (type) { case AggregationType::None: return "none"; case AggregationType::Min: return "min"; + case AggregationType::Mean: return "mean"; } return "unknown"; } @@ -69,6 +70,12 @@ std::vector aggregate_and_sort_query_times( for (auto const& times: query_times) { aggregated_query_times.push_back(*std::min_element(times.begin(), times.end())); } + } else if (aggregation_type == AggregationType::Mean) { + for (auto const& times: query_times) { + double sum = std::accumulate(times.begin(), times.end(), double()); + double mean = sum / times.size(); + aggregated_query_times.push_back(mean); + } } std::sort(aggregated_query_times.begin(), aggregated_query_times.end()); return aggregated_query_times; @@ -134,6 +141,7 @@ void extract_times( spdlog::info("Runs per query (excluding warmup): {}", runs); print_stats(AggregationType::None, aggregate_and_sort_query_times(AggregationType::None, 
query_times)); print_stats(AggregationType::Min, aggregate_and_sort_query_times(AggregationType::Min, query_times)); + print_stats(AggregationType::Mean, aggregate_and_sort_query_times(AggregationType::Mean, query_times)); } template From 1d4a48b0633ca961ddb1fa947bfda6198e46eede Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Tue, 18 Nov 2025 17:13:38 -0300 Subject: [PATCH 09/55] Modify `--extract` mode to print query stats aggregating by median --- tools/queries.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index ae686266..bdf3003a 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -44,13 +44,14 @@ using namespace pisa; using ranges::views::enumerate; -enum class AggregationType { None = 0, Min = 1, Mean = 2, }; +enum class AggregationType { None = 0, Min = 1, Mean = 2, Median = 3}; [[nodiscard]] auto to_string(AggregationType type) -> std::string { switch (type) { case AggregationType::None: return "none"; case AggregationType::Min: return "min"; case AggregationType::Mean: return "mean"; + case AggregationType::Median: return "median"; } return "unknown"; } @@ -76,6 +77,19 @@ std::vector aggregate_and_sort_query_times( double mean = sum / times.size(); aggregated_query_times.push_back(mean); } + } else if (aggregation_type == AggregationType::Median) { + for (auto const& times: query_times) { + auto sorted_times = times; + std::sort(sorted_times.begin(), sorted_times.end()); + std::size_t sample_count = sorted_times.size(); + double median = 0; + if (sample_count % 2 == 1) { + median = sorted_times[sample_count / 2]; + } else { + median = (sorted_times[sample_count / 2] + sorted_times[sample_count / 2 - 1]) / 2; + } + aggregated_query_times.push_back(median); + } } std::sort(aggregated_query_times.begin(), aggregated_query_times.end()); return aggregated_query_times; @@ -142,6 +156,7 @@ void extract_times( print_stats(AggregationType::None, 
aggregate_and_sort_query_times(AggregationType::None, query_times)); print_stats(AggregationType::Min, aggregate_and_sort_query_times(AggregationType::Min, query_times)); print_stats(AggregationType::Mean, aggregate_and_sort_query_times(AggregationType::Mean, query_times)); + print_stats(AggregationType::Median, aggregate_and_sort_query_times(AggregationType::Median, query_times)); } template From 46ac2657bcaedbdf6e51af127a5a1900f4af5313 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Tue, 18 Nov 2025 17:20:09 -0300 Subject: [PATCH 10/55] Modify `--extract` mode to print query stats aggregating by max --- tools/queries.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index bdf3003a..8dd60e06 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -44,7 +44,7 @@ using namespace pisa; using ranges::views::enumerate; -enum class AggregationType { None = 0, Min = 1, Mean = 2, Median = 3}; +enum class AggregationType { None = 0, Min = 1, Mean = 2, Median = 3, Max = 4}; [[nodiscard]] auto to_string(AggregationType type) -> std::string { switch (type) { @@ -52,6 +52,7 @@ enum class AggregationType { None = 0, Min = 1, Mean = 2, Median = 3}; case AggregationType::Min: return "min"; case AggregationType::Mean: return "mean"; case AggregationType::Median: return "median"; + case AggregationType::Max: return "max"; } return "unknown"; } @@ -90,6 +91,10 @@ std::vector aggregate_and_sort_query_times( } aggregated_query_times.push_back(median); } + } else if (aggregation_type == AggregationType::Max) { + for (auto const& times: query_times) { + aggregated_query_times.push_back(*std::max_element(times.begin(), times.end())); + } } std::sort(aggregated_query_times.begin(), aggregated_query_times.end()); return aggregated_query_times; @@ -157,6 +162,7 @@ void extract_times( print_stats(AggregationType::Min, aggregate_and_sort_query_times(AggregationType::Min, query_times)); 
print_stats(AggregationType::Mean, aggregate_and_sort_query_times(AggregationType::Mean, query_times)); print_stats(AggregationType::Median, aggregate_and_sort_query_times(AggregationType::Median, query_times)); + print_stats(AggregationType::Max, aggregate_and_sort_query_times(AggregationType::Max, query_times)); } template From 67a45ead6b4a0c07f0b5cbcc307f68d56eebf15b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Tue, 18 Nov 2025 17:35:28 -0300 Subject: [PATCH 11/55] Improve variable naming --- tools/queries.cpp | 50 +++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 8dd60e06..70c665ad 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -57,43 +57,43 @@ enum class AggregationType { None = 0, Min = 1, Mean = 2, Median = 3, Max = 4}; return "unknown"; } -std::vector aggregate_and_sort_query_times( +std::vector aggregate_and_sort_times_per_query( AggregationType aggregation_type, - std::vector> const& query_times + std::vector> const& times_per_query ) { std::vector aggregated_query_times; if (aggregation_type == AggregationType::None) { - for (auto const& times: query_times) { - for (auto t: times) { + for (auto const& query_times: times_per_query) { + for (auto t: query_times) { aggregated_query_times.push_back(t); } } } else if (aggregation_type == AggregationType::Min) { - for (auto const& times: query_times) { - aggregated_query_times.push_back(*std::min_element(times.begin(), times.end())); + for (auto const& query_times: times_per_query) { + aggregated_query_times.push_back(*std::min_element(query_times.begin(), query_times.end())); } } else if (aggregation_type == AggregationType::Mean) { - for (auto const& times: query_times) { - double sum = std::accumulate(times.begin(), times.end(), double()); - double mean = sum / times.size(); + for (auto const& query_times: times_per_query) { + double sum = 
std::accumulate(query_times.begin(), query_times.end(), double()); + double mean = sum / query_times.size(); aggregated_query_times.push_back(mean); } } else if (aggregation_type == AggregationType::Median) { - for (auto const& times: query_times) { - auto sorted_times = times; - std::sort(sorted_times.begin(), sorted_times.end()); - std::size_t sample_count = sorted_times.size(); + for (auto const& query_times: times_per_query) { + auto sorted_query_times = query_times; + std::sort(sorted_query_times.begin(), sorted_query_times.end()); + std::size_t sample_count = sorted_query_times.size(); double median = 0; if (sample_count % 2 == 1) { - median = sorted_times[sample_count / 2]; + median = sorted_query_times[sample_count / 2]; } else { - median = (sorted_times[sample_count / 2] + sorted_times[sample_count / 2 - 1]) / 2; + median = (sorted_query_times[sample_count / 2] + sorted_query_times[sample_count / 2 - 1]) / 2; } aggregated_query_times.push_back(median); } } else if (aggregation_type == AggregationType::Max) { - for (auto const& times: query_times) { - aggregated_query_times.push_back(*std::max_element(times.begin(), times.end())); + for (auto const& query_times: times_per_query) { + aggregated_query_times.push_back(*std::max_element(query_times.begin(), query_times.end())); } } std::sort(aggregated_query_times.begin(), aggregated_query_times.end()); @@ -122,7 +122,7 @@ void extract_times( bool safe, std::ostream& os ) { - std::vector> query_times(queries.size(), std::vector(runs)); + std::vector> times_per_query(queries.size(), std::vector(runs)); std::size_t corrective_rerun_count = 0; spdlog::info("Safe: {}", safe); @@ -140,7 +140,7 @@ void extract_times( do_not_optimize_away(result); }); if (run != 0) { // first run is not timed - query_times[query_idx][run - 1] = usecs.count(); + times_per_query[query_idx][run - 1] = usecs.count(); } query_idx += 1; } @@ -149,7 +149,7 @@ void extract_times( // Print timing results for each query. 
for (auto&& [query_idx, query]: enumerate(queries)) { os << fmt::format("{}", query.id().value_or(std::to_string(query_idx))); - for (auto t: query_times[query_idx]) { + for (auto t: times_per_query[query_idx]) { os << fmt::format("\t{}", t); } os << "\n"; @@ -158,11 +158,11 @@ void extract_times( spdlog::info("---- {} {}", index_type, query_type); spdlog::info("Corrective reruns due to insufficient results: {}", corrective_rerun_count); spdlog::info("Runs per query (excluding warmup): {}", runs); - print_stats(AggregationType::None, aggregate_and_sort_query_times(AggregationType::None, query_times)); - print_stats(AggregationType::Min, aggregate_and_sort_query_times(AggregationType::Min, query_times)); - print_stats(AggregationType::Mean, aggregate_and_sort_query_times(AggregationType::Mean, query_times)); - print_stats(AggregationType::Median, aggregate_and_sort_query_times(AggregationType::Median, query_times)); - print_stats(AggregationType::Max, aggregate_and_sort_query_times(AggregationType::Max, query_times)); + print_stats(AggregationType::None, aggregate_and_sort_times_per_query(AggregationType::None, times_per_query)); + print_stats(AggregationType::Min, aggregate_and_sort_times_per_query(AggregationType::Min, times_per_query)); + print_stats(AggregationType::Mean, aggregate_and_sort_times_per_query(AggregationType::Mean, times_per_query)); + print_stats(AggregationType::Median, aggregate_and_sort_times_per_query(AggregationType::Median, times_per_query)); + print_stats(AggregationType::Max, aggregate_and_sort_times_per_query(AggregationType::Max, times_per_query)); } template From b60c5aacc110362132620a4a5409323c4743948d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Tue, 18 Nov 2025 18:36:38 -0300 Subject: [PATCH 12/55] Extend `stats_line()` to write in a configurable output --- include/pisa/util/util.hpp | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git 
a/include/pisa/util/util.hpp b/include/pisa/util/util.hpp index aa2811f2..834290bd 100644 --- a/include/pisa/util/util.hpp +++ b/include/pisa/util/util.hpp @@ -113,23 +113,24 @@ function_iterator make_function_iterator( } struct stats_line { - stats_line() { std::cout << "{"; } + stats_line() : m_out(std::cout) { m_out << "{"; } + explicit stats_line(std::ostream& out) : m_out(out) { m_out << "{"; } stats_line(stats_line const&) = default; stats_line(stats_line&&) noexcept = default; stats_line& operator=(stats_line const&) = default; stats_line& operator=(stats_line&&) noexcept = default; - ~stats_line() { std::cout << "}" << std::endl; } + ~stats_line() { m_out << "}" << std::endl; } template stats_line& operator()(K const& key, T const& value) { if (!first) { - std::cout << ", "; + m_out << ", "; } else { first = false; } emit(key); - std::cout << ": "; + m_out << ": "; emit(value); return *this; } @@ -142,27 +143,27 @@ struct stats_line { private: template void emit(T const& v) const { - std::cout << v; + m_out << v; } // XXX properly escape strings - void emit(const char* s) const { std::cout << '"' << s << '"'; } + void emit(const char* s) const { m_out << '"' << s << '"'; } void emit(std::string const& s) const { emit(s.c_str()); } template void emit(std::vector const& v) const { - std::cout << "["; + m_out << "["; bool first = true; for (auto const& i: v) { if (first) { first = false; } else { - std::cout << ", "; + m_out << ", "; } emit(i); } - std::cout << "]"; + m_out << "]"; } template @@ -174,7 +175,7 @@ struct stats_line { template typename std::enable_if::type emit_tuple_helper(Tuple const& t) const { emit_tuple_helper(t); - std::cout << ", "; + m_out << ", "; emit(std::get(t)); } @@ -185,9 +186,9 @@ struct stats_line { template void emit(std::tuple const& t) const { - std::cout << "["; + m_out << "["; emit_tuple_helper, sizeof...(Tp) - 1>(t); - std::cout << "]"; + m_out << "]"; } template @@ -195,6 +196,7 @@ struct stats_line { 
emit(std::make_tuple(p.first, p.second)); } + std::ostream& m_out; bool first{true}; }; From 3c332b7b9326e336a1eeb9717ca964918f32f3e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Tue, 18 Nov 2025 18:39:07 -0300 Subject: [PATCH 13/55] Use `std::cerr` to print query stats --- tools/queries.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 70c665ad..2adbb29b 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -107,7 +107,7 @@ void print_stats(AggregationType aggregation_type, std::vector cons double q90 = query_times[90 * query_times.size() / 100]; double q95 = query_times[95 * query_times.size() / 100]; double q99 = query_times[99 * query_times.size() / 100]; - stats_line()("aggregated_by", to_string(aggregation_type))("mean", mean)("q50", q50)("q90", q90)("q95", q95)("q99", q99); + stats_line(std::cerr)("aggregated_by", to_string(aggregation_type))("mean", mean)("q50", q50)("q90", q90)("q95", q95)("q99", q99); } template From 7cf768dec95c52d5e6404c189f6a0d9a129838e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Tue, 18 Nov 2025 23:13:04 -0300 Subject: [PATCH 14/55] Implement aggregation adding the `--aggregate-by` option --- tools/queries.cpp | 73 +++++++++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 2adbb29b..e13640ad 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -120,6 +121,7 @@ void extract_times( size_t runs, std::uint64_t k, bool safe, + AggregationType aggregate_by, std::ostream& os ) { std::vector> times_per_query(queries.size(), std::vector(runs)); @@ -146,23 +148,41 @@ void extract_times( } } - // Print timing results for each query. 
- for (auto&& [query_idx, query]: enumerate(queries)) { - os << fmt::format("{}", query.id().value_or(std::to_string(query_idx))); - for (auto t: times_per_query[query_idx]) { - os << fmt::format("\t{}", t); - } - os << "\n"; - } - spdlog::info("---- {} {}", index_type, query_type); spdlog::info("Corrective reruns due to insufficient results: {}", corrective_rerun_count); spdlog::info("Runs per query (excluding warmup): {}", runs); - print_stats(AggregationType::None, aggregate_and_sort_times_per_query(AggregationType::None, times_per_query)); - print_stats(AggregationType::Min, aggregate_and_sort_times_per_query(AggregationType::Min, times_per_query)); - print_stats(AggregationType::Mean, aggregate_and_sort_times_per_query(AggregationType::Mean, times_per_query)); - print_stats(AggregationType::Median, aggregate_and_sort_times_per_query(AggregationType::Median, times_per_query)); - print_stats(AggregationType::Max, aggregate_and_sort_times_per_query(AggregationType::Max, times_per_query)); + + if (aggregate_by == AggregationType::None) { + auto print_aggregated_stats = [&](AggregationType type) { + print_stats(type, aggregate_and_sort_times_per_query(type, times_per_query)); + }; + print_aggregated_stats(AggregationType::None); + print_aggregated_stats(AggregationType::Min); + print_aggregated_stats(AggregationType::Mean); + print_aggregated_stats(AggregationType::Median); + print_aggregated_stats(AggregationType::Max); + + std::cout << "qid"; + for (size_t i = 1; i <= runs; ++i) { + std::cout << fmt::format("\tusec{}", i); + } + std::cout<<"\n"; + for (auto&& [query_idx, query]: enumerate(queries)) { + os << fmt::format("{}", query.id().value_or(std::to_string(query_idx))); + for (auto t: times_per_query[query_idx]) { + os << fmt::format("\t{}", t); + } + os << "\n"; + } + } else { + auto aggregated_query_times = aggregate_and_sort_times_per_query(aggregate_by, times_per_query); + print_stats(aggregate_by, aggregated_query_times); + + std::cout << 
fmt::format("qid\tusec_{}", to_string(aggregate_by)) << "\n"; + for (auto&& [query_idx, query]: enumerate(queries)) { + os << fmt::format("{}\t{}\n", query.id().value_or(std::to_string(query_idx)), aggregated_query_times[query_idx]); + } + } } template @@ -238,7 +258,8 @@ void perftest( const bool weighted, bool extract, bool safe, - std::size_t runs + std::size_t runs, + AggregationType aggregate_by ) { auto const& index = *index_ptr; @@ -398,7 +419,7 @@ void perftest( break; } if (extract) { - extract_times(query_fun, queries, thresholds, type, t, runs, k, safe, std::cout); + extract_times(query_fun, queries, thresholds, type, t, runs, k, safe, aggregate_by, std::cout); } else { op_perftest(query_fun, queries, thresholds, type, t, runs, k, safe); } @@ -414,6 +435,7 @@ int main(int argc, const char** argv) { bool safe = false; bool quantized = false; std::size_t runs = 0; + AggregationType aggregate_by = AggregationType::None; App, @@ -430,17 +452,19 @@ int main(int argc, const char** argv) { app.add_option("--runs", runs, "Number of runs per query") ->default_val(2) ->check(CLI::PositiveNumber); + app.add_option("--aggregate-by", aggregate_by, "Aggregation mode for results per query") + ->transform(CLI::CheckedTransformer(std::map{ + {"none", AggregationType::None}, + {"min", AggregationType::Min}, + {"mean", AggregationType::Mean}, + {"median", AggregationType::Median}, + {"max", AggregationType::Max}, + })) + ->default_val("none"); CLI11_PARSE(app, argc, argv); spdlog::set_default_logger(spdlog::stderr_color_mt("stderr")); spdlog::set_level(app.log_level()); - if (extract) { - std::cout << "qid"; - for (size_t i = 1; i <= runs; ++i) { - std::cout << fmt::format("\tusec{}", i); - } - std::cout<<"\n"; - } run_for_index( app.index_encoding(), MemorySource::mapped_file(app.index_filename()), [&](auto index) { @@ -457,7 +481,8 @@ int main(int argc, const char** argv) { app.weighted(), extract, safe, - runs + runs, + aggregate_by ); if (app.is_wand_compressed()) { if 
(quantized) { From 042f4c2c3a7b4cb5389a6b32da2a8d0dea20014f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Tue, 18 Nov 2025 23:57:39 -0300 Subject: [PATCH 15/55] Add `--summary-only` to just print summary, ommiting per-query results --- tools/queries.cpp | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index e13640ad..87afde5d 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -122,6 +122,7 @@ void extract_times( std::uint64_t k, bool safe, AggregationType aggregate_by, + bool summary_only, std::ostream& os ) { std::vector> times_per_query(queries.size(), std::vector(runs)); @@ -162,25 +163,31 @@ void extract_times( print_aggregated_stats(AggregationType::Median); print_aggregated_stats(AggregationType::Max); - std::cout << "qid"; - for (size_t i = 1; i <= runs; ++i) { - std::cout << fmt::format("\tusec{}", i); - } - std::cout<<"\n"; - for (auto&& [query_idx, query]: enumerate(queries)) { - os << fmt::format("{}", query.id().value_or(std::to_string(query_idx))); - for (auto t: times_per_query[query_idx]) { - os << fmt::format("\t{}", t); + if (!summary_only) { + std::cout << "qid"; + for (size_t i = 1; i <= runs; ++i) { + std::cout << fmt::format("\tusec{}", i); + } + std::cout << "\n"; + for (auto&& [query_idx, query]: enumerate(queries)) { + os << fmt::format("{}", query.id().value_or(std::to_string(query_idx))); + for (auto t: times_per_query[query_idx]) { + os << fmt::format("\t{}", t); + } + os << "\n"; } - os << "\n"; } } else { auto aggregated_query_times = aggregate_and_sort_times_per_query(aggregate_by, times_per_query); print_stats(aggregate_by, aggregated_query_times); - std::cout << fmt::format("qid\tusec_{}", to_string(aggregate_by)) << "\n"; - for (auto&& [query_idx, query]: enumerate(queries)) { - os << fmt::format("{}\t{}\n", query.id().value_or(std::to_string(query_idx)), aggregated_query_times[query_idx]); + 
if (!summary_only) { + std::cout << fmt::format("qid\tusec_{}", to_string(aggregate_by)) << "\n"; + for (auto&& [query_idx, query]: enumerate(queries)) { + os << fmt::format( + "{}\t{}\n", query.id().value_or(std::to_string(query_idx)), aggregated_query_times[query_idx] + ); + } } } } @@ -259,7 +266,8 @@ void perftest( bool extract, bool safe, std::size_t runs, - AggregationType aggregate_by + AggregationType aggregate_by, + bool summary_only ) { auto const& index = *index_ptr; @@ -419,7 +427,9 @@ void perftest( break; } if (extract) { - extract_times(query_fun, queries, thresholds, type, t, runs, k, safe, aggregate_by, std::cout); + extract_times( + query_fun, queries, thresholds, type, t, runs, k, safe, aggregate_by, summary_only, std::cout + ); } else { op_perftest(query_fun, queries, thresholds, type, t, runs, k, safe); } @@ -434,6 +444,7 @@ int main(int argc, const char** argv) { bool extract = false; bool safe = false; bool quantized = false; + bool summary_only = false; std::size_t runs = 0; AggregationType aggregate_by = AggregationType::None; @@ -449,6 +460,7 @@ int main(int argc, const char** argv) { app.add_flag("--extract", extract, "Extract individual query times"); app.add_flag("--safe", safe, "Rerun if not enough results with pruning.") ->needs(app.thresholds_option()); + app.add_flag("--summary-only", summary_only, "Only print summary stats, ommiting per-query results"); app.add_option("--runs", runs, "Number of runs per query") ->default_val(2) ->check(CLI::PositiveNumber); @@ -482,7 +494,8 @@ int main(int argc, const char** argv) { extract, safe, runs, - aggregate_by + aggregate_by, + summary_only ); if (app.is_wand_compressed()) { if (quantized) { From dad7d298f1e5e9ad1746900e581d0b1fc47eceee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Wed, 19 Nov 2025 00:05:43 -0300 Subject: [PATCH 16/55] Remove `--extract` option and `op_perftest()` function --- tools/queries.cpp | 74 
++--------------------------------------------- 1 file changed, 3 insertions(+), 71 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 87afde5d..83e17703 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -192,66 +192,6 @@ void extract_times( } } -template -void op_perftest( - Functor query_func, - std::vector const& queries, - std::vector const& thresholds, - std::string const& index_type, - std::string const& query_type, - size_t runs, - std::uint64_t k, - bool safe -) { - std::vector query_times; - std::size_t corrective_rerun_count = 0; - spdlog::info("Safe: {}", safe); - - for (size_t run = 0; run <= runs; ++run) { - size_t idx = 0; - for (auto const& query: queries) { - auto usecs = run_with_timer([&]() { - uint64_t result = query_func(query, thresholds[idx]); - if (safe && result < k) { - corrective_rerun_count += 1; - result = query_func(query, 0); - } - do_not_optimize_away(result); - }); - if (run != 0) { // first run is not timed - query_times.push_back(usecs.count()); - } - idx += 1; - } - } - - if (false) { - for (auto t: query_times) { - std::cout << (t / 1000) << std::endl; - } - } else { - std::sort(query_times.begin(), query_times.end()); - double avg = - std::accumulate(query_times.begin(), query_times.end(), double()) / query_times.size(); - double q50 = query_times[query_times.size() / 2]; - double q90 = query_times[90 * query_times.size() / 100]; - double q95 = query_times[95 * query_times.size() / 100]; - double q99 = query_times[99 * query_times.size() / 100]; - - spdlog::info("---- {} {}", index_type, query_type); - spdlog::info("Mean: {}", avg); - spdlog::info("50% quantile: {}", q50); - spdlog::info("90% quantile: {}", q90); - spdlog::info("95% quantile: {}", q95); - spdlog::info("99% quantile: {}", q99); - spdlog::info("Corrective reruns due to insufficient results: {}", corrective_rerun_count); - spdlog::info("Runs per query (excluding warmup): {}", runs); - - stats_line()("type", index_type)("query", 
query_type)("avg", avg)("q50", q50)("q90", q90)( - "q95", q95)("q99", q99); - } -} - template void perftest( IndexType const* index_ptr, @@ -263,7 +203,6 @@ void perftest( uint64_t k, const ScorerParams& scorer_params, const bool weighted, - bool extract, bool safe, std::size_t runs, AggregationType aggregate_by, @@ -426,13 +365,9 @@ void perftest( spdlog::error("Unsupported query type: {}", t); break; } - if (extract) { - extract_times( - query_fun, queries, thresholds, type, t, runs, k, safe, aggregate_by, summary_only, std::cout - ); - } else { - op_perftest(query_fun, queries, thresholds, type, t, runs, k, safe); - } + extract_times( + query_fun, queries, thresholds, type, t, runs, k, safe, aggregate_by, summary_only, std::cout + ); } } @@ -441,7 +376,6 @@ using wand_uniform_index = wand_data>; using wand_uniform_index_quantized = wand_data>; int main(int argc, const char** argv) { - bool extract = false; bool safe = false; bool quantized = false; bool summary_only = false; @@ -457,7 +391,6 @@ int main(int argc, const char** argv) { arg::LogLevel> app{"Benchmarks queries on a given index."}; app.add_flag("--quantized", quantized, "Quantized scores"); - app.add_flag("--extract", extract, "Extract individual query times"); app.add_flag("--safe", safe, "Rerun if not enough results with pruning.") ->needs(app.thresholds_option()); app.add_flag("--summary-only", summary_only, "Only print summary stats, ommiting per-query results"); @@ -491,7 +424,6 @@ int main(int argc, const char** argv) { app.k(), app.scorer_params(), app.weighted(), - extract, safe, runs, aggregate_by, From 8813f94bb87cf6a0180c15c1d3e1b8b838009f74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Wed, 19 Nov 2025 00:42:22 -0300 Subject: [PATCH 17/55] Modify the default number of runs from 2 to 3 --- tools/queries.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 83e17703..f7d271eb 100644 --- 
a/tools/queries.cpp +++ b/tools/queries.cpp @@ -395,7 +395,7 @@ int main(int argc, const char** argv) { ->needs(app.thresholds_option()); app.add_flag("--summary-only", summary_only, "Only print summary stats, ommiting per-query results"); app.add_option("--runs", runs, "Number of runs per query") - ->default_val(2) + ->default_val(3) ->check(CLI::PositiveNumber); app.add_option("--aggregate-by", aggregate_by, "Aggregation mode for results per query") ->transform(CLI::CheckedTransformer(std::map{ From d3c85af1c4957921f41503ad065290e959b9f42e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Wed, 19 Nov 2025 00:50:52 -0300 Subject: [PATCH 18/55] Format `queries.cpp` --- tools/queries.cpp | 102 ++++++++++++++++++++++++---------------------- 1 file changed, 54 insertions(+), 48 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index f7d271eb..ccb82ca6 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -1,9 +1,9 @@ #include #include +#include #include #include #include -#include #include #include @@ -45,22 +45,21 @@ using namespace pisa; using ranges::views::enumerate; -enum class AggregationType { None = 0, Min = 1, Mean = 2, Median = 3, Max = 4}; +enum class AggregationType { None = 0, Min = 1, Mean = 2, Median = 3, Max = 4 }; [[nodiscard]] auto to_string(AggregationType type) -> std::string { switch (type) { - case AggregationType::None: return "none"; - case AggregationType::Min: return "min"; - case AggregationType::Mean: return "mean"; - case AggregationType::Median: return "median"; - case AggregationType::Max: return "max"; + case AggregationType::None: return "none"; + case AggregationType::Min: return "min"; + case AggregationType::Mean: return "mean"; + case AggregationType::Median: return "median"; + case AggregationType::Max: return "max"; } return "unknown"; } std::vector aggregate_and_sort_times_per_query( - AggregationType aggregation_type, - std::vector> const& times_per_query + AggregationType 
aggregation_type, std::vector> const& times_per_query ) { std::vector aggregated_query_times; if (aggregation_type == AggregationType::None) { @@ -71,7 +70,8 @@ std::vector aggregate_and_sort_times_per_query( } } else if (aggregation_type == AggregationType::Min) { for (auto const& query_times: times_per_query) { - aggregated_query_times.push_back(*std::min_element(query_times.begin(), query_times.end())); + aggregated_query_times.push_back(*std::min_element(query_times.begin(), query_times.end()) + ); } } else if (aggregation_type == AggregationType::Mean) { for (auto const& query_times: times_per_query) { @@ -88,13 +88,16 @@ std::vector aggregate_and_sort_times_per_query( if (sample_count % 2 == 1) { median = sorted_query_times[sample_count / 2]; } else { - median = (sorted_query_times[sample_count / 2] + sorted_query_times[sample_count / 2 - 1]) / 2; + median = + (sorted_query_times[sample_count / 2] + sorted_query_times[sample_count / 2 - 1]) + / 2; } aggregated_query_times.push_back(median); } } else if (aggregation_type == AggregationType::Max) { for (auto const& query_times: times_per_query) { - aggregated_query_times.push_back(*std::max_element(query_times.begin(), query_times.end())); + aggregated_query_times.push_back(*std::max_element(query_times.begin(), query_times.end()) + ); } } std::sort(aggregated_query_times.begin(), aggregated_query_times.end()); @@ -102,8 +105,8 @@ std::vector aggregate_and_sort_times_per_query( } void print_stats(AggregationType aggregation_type, std::vector const& query_times) { - double mean = std::accumulate(query_times.begin(), query_times.end(), double()) - / query_times.size(); + double mean = + std::accumulate(query_times.begin(), query_times.end(), double()) / query_times.size(); double q50 = query_times[query_times.size() / 2]; double q90 = query_times[90 * query_times.size() / 100]; double q95 = query_times[95 * query_times.size() / 100]; @@ -125,7 +128,9 @@ void extract_times( bool summary_only, std::ostream& os ) { 
- std::vector> times_per_query(queries.size(), std::vector(runs)); + std::vector> times_per_query( + queries.size(), std::vector(runs) + ); std::size_t corrective_rerun_count = 0; spdlog::info("Safe: {}", safe); @@ -142,7 +147,7 @@ void extract_times( } do_not_optimize_away(result); }); - if (run != 0) { // first run is not timed + if (run != 0) { // first run is not timed times_per_query[query_idx][run - 1] = usecs.count(); } query_idx += 1; @@ -178,14 +183,17 @@ void extract_times( } } } else { - auto aggregated_query_times = aggregate_and_sort_times_per_query(aggregate_by, times_per_query); + auto aggregated_query_times = + aggregate_and_sort_times_per_query(aggregate_by, times_per_query); print_stats(aggregate_by, aggregated_query_times); if (!summary_only) { std::cout << fmt::format("qid\tusec_{}", to_string(aggregate_by)) << "\n"; for (auto&& [query_idx, query]: enumerate(queries)) { os << fmt::format( - "{}\t{}\n", query.id().value_or(std::to_string(query_idx)), aggregated_query_times[query_idx] + "{}\t{}\n", + query.id().value_or(std::to_string(query_idx)), + aggregated_query_times[query_idx] ); } } @@ -393,10 +401,10 @@ int main(int argc, const char** argv) { app.add_flag("--quantized", quantized, "Quantized scores"); app.add_flag("--safe", safe, "Rerun if not enough results with pruning.") ->needs(app.thresholds_option()); - app.add_flag("--summary-only", summary_only, "Only print summary stats, ommiting per-query results"); - app.add_option("--runs", runs, "Number of runs per query") - ->default_val(3) - ->check(CLI::PositiveNumber); + app.add_flag( + "--summary-only", summary_only, "Only print summary stats, ommiting per-query results" + ); + app.add_option("--runs", runs, "Number of runs per query")->default_val(3)->check(CLI::PositiveNumber); app.add_option("--aggregate-by", aggregate_by, "Aggregation mode for results per query") ->transform(CLI::CheckedTransformer(std::map{ {"none", AggregationType::None}, @@ -411,33 +419,31 @@ int main(int argc, 
const char** argv) { spdlog::set_default_logger(spdlog::stderr_color_mt("stderr")); spdlog::set_level(app.log_level()); - run_for_index( - app.index_encoding(), MemorySource::mapped_file(app.index_filename()), [&](auto index) { - using Index = std::decay_t; - auto params = std::make_tuple( - &index, - app.wand_data_path(), - app.queries(), - app.thresholds_file(), - app.index_encoding(), - app.algorithm(), - app.k(), - app.scorer_params(), - app.weighted(), - safe, - runs, - aggregate_by, - summary_only - ); - if (app.is_wand_compressed()) { - if (quantized) { - std::apply(perftest, params); - } else { - std::apply(perftest, params); - } + run_for_index(app.index_encoding(), MemorySource::mapped_file(app.index_filename()), [&](auto index) { + using Index = std::decay_t; + auto params = std::make_tuple( + &index, + app.wand_data_path(), + app.queries(), + app.thresholds_file(), + app.index_encoding(), + app.algorithm(), + app.k(), + app.scorer_params(), + app.weighted(), + safe, + runs, + aggregate_by, + summary_only + ); + if (app.is_wand_compressed()) { + if (quantized) { + std::apply(perftest, params); } else { - std::apply(perftest, params); + std::apply(perftest, params); } + } else { + std::apply(perftest, params); } - ); + }); } From 0b01c9bdeb23092ef6ca36f4b09b7a0f07598cf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Wed, 19 Nov 2025 00:56:16 -0300 Subject: [PATCH 19/55] Remove unnecessary `map` import --- tools/queries.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index ccb82ca6..f772985e 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -1,6 +1,5 @@ #include #include -#include #include #include #include From 3832bdd5366ae270563102e72a024ca66b8e2d51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Wed, 19 Nov 2025 01:24:57 -0300 Subject: [PATCH 20/55] Format `queries.cpp` --- tools/queries.cpp | 74 
+++++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index f772985e..a118c488 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -69,8 +69,7 @@ std::vector aggregate_and_sort_times_per_query( } } else if (aggregation_type == AggregationType::Min) { for (auto const& query_times: times_per_query) { - aggregated_query_times.push_back(*std::min_element(query_times.begin(), query_times.end()) - ); + aggregated_query_times.push_back(*std::min_element(query_times.begin(), query_times.end())); } } else if (aggregation_type == AggregationType::Mean) { for (auto const& query_times: times_per_query) { @@ -95,8 +94,7 @@ std::vector aggregate_and_sort_times_per_query( } } else if (aggregation_type == AggregationType::Max) { for (auto const& query_times: times_per_query) { - aggregated_query_times.push_back(*std::max_element(query_times.begin(), query_times.end()) - ); + aggregated_query_times.push_back(*std::max_element(query_times.begin(), query_times.end())); } } std::sort(aggregated_query_times.begin(), aggregated_query_times.end()); @@ -405,44 +403,50 @@ int main(int argc, const char** argv) { ); app.add_option("--runs", runs, "Number of runs per query")->default_val(3)->check(CLI::PositiveNumber); app.add_option("--aggregate-by", aggregate_by, "Aggregation mode for results per query") - ->transform(CLI::CheckedTransformer(std::map{ - {"none", AggregationType::None}, - {"min", AggregationType::Min}, - {"mean", AggregationType::Mean}, - {"median", AggregationType::Median}, - {"max", AggregationType::Max}, - })) + ->transform( + CLI::CheckedTransformer( + std::map{ + {"none", AggregationType::None}, + {"min", AggregationType::Min}, + {"mean", AggregationType::Mean}, + {"median", AggregationType::Median}, + {"max", AggregationType::Max}, + } + ) + ) ->default_val("none"); CLI11_PARSE(app, argc, argv); spdlog::set_default_logger(spdlog::stderr_color_mt("stderr")); 
spdlog::set_level(app.log_level()); - run_for_index(app.index_encoding(), MemorySource::mapped_file(app.index_filename()), [&](auto index) { - using Index = std::decay_t; - auto params = std::make_tuple( - &index, - app.wand_data_path(), - app.queries(), - app.thresholds_file(), - app.index_encoding(), - app.algorithm(), - app.k(), - app.scorer_params(), - app.weighted(), - safe, - runs, - aggregate_by, - summary_only - ); - if (app.is_wand_compressed()) { - if (quantized) { - std::apply(perftest, params); + run_for_index( + app.index_encoding(), MemorySource::mapped_file(app.index_filename()), [&](auto index) { + using Index = std::decay_t; + auto params = std::make_tuple( + &index, + app.wand_data_path(), + app.queries(), + app.thresholds_file(), + app.index_encoding(), + app.algorithm(), + app.k(), + app.scorer_params(), + app.weighted(), + safe, + runs, + aggregate_by, + summary_only + ); + if (app.is_wand_compressed()) { + if (quantized) { + std::apply(perftest, params); + } else { + std::apply(perftest, params); + } } else { - std::apply(perftest, params); + std::apply(perftest, params); } - } else { - std::apply(perftest, params); } - }); + ); } From decb5f84c7e397eec29b456bf8bb271a1fe2e1d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Thu, 25 Dec 2025 23:38:41 -0300 Subject: [PATCH 21/55] Remove `--aggregate-by` option, printing all by default --- tools/queries.cpp | 72 +++++++++++++---------------------------------- 1 file changed, 19 insertions(+), 53 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index a118c488..f5a37fae 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -121,7 +121,6 @@ void extract_times( size_t runs, std::uint64_t k, bool safe, - AggregationType aggregate_by, bool summary_only, std::ostream& os ) { @@ -155,44 +154,27 @@ void extract_times( spdlog::info("Corrective reruns due to insufficient results: {}", corrective_rerun_count); spdlog::info("Runs per query (excluding warmup): 
{}", runs); - if (aggregate_by == AggregationType::None) { - auto print_aggregated_stats = [&](AggregationType type) { - print_stats(type, aggregate_and_sort_times_per_query(type, times_per_query)); - }; - print_aggregated_stats(AggregationType::None); - print_aggregated_stats(AggregationType::Min); - print_aggregated_stats(AggregationType::Mean); - print_aggregated_stats(AggregationType::Median); - print_aggregated_stats(AggregationType::Max); + auto print_aggregated_stats = [&](AggregationType type) { + print_stats(type, aggregate_and_sort_times_per_query(type, times_per_query)); + }; + print_aggregated_stats(AggregationType::None); + print_aggregated_stats(AggregationType::Min); + print_aggregated_stats(AggregationType::Mean); + print_aggregated_stats(AggregationType::Median); + print_aggregated_stats(AggregationType::Max); - if (!summary_only) { - std::cout << "qid"; - for (size_t i = 1; i <= runs; ++i) { - std::cout << fmt::format("\tusec{}", i); - } - std::cout << "\n"; - for (auto&& [query_idx, query]: enumerate(queries)) { - os << fmt::format("{}", query.id().value_or(std::to_string(query_idx))); - for (auto t: times_per_query[query_idx]) { - os << fmt::format("\t{}", t); - } - os << "\n"; - } + if (!summary_only) { + std::cout << "qid"; + for (size_t i = 1; i <= runs; ++i) { + std::cout << fmt::format("\tusec{}", i); } - } else { - auto aggregated_query_times = - aggregate_and_sort_times_per_query(aggregate_by, times_per_query); - print_stats(aggregate_by, aggregated_query_times); - - if (!summary_only) { - std::cout << fmt::format("qid\tusec_{}", to_string(aggregate_by)) << "\n"; - for (auto&& [query_idx, query]: enumerate(queries)) { - os << fmt::format( - "{}\t{}\n", - query.id().value_or(std::to_string(query_idx)), - aggregated_query_times[query_idx] - ); + std::cout << "\n"; + for (auto&& [query_idx, query]: enumerate(queries)) { + os << fmt::format("{}", query.id().value_or(std::to_string(query_idx))); + for (auto t: times_per_query[query_idx]) { + 
os << fmt::format("\t{}", t); } + os << "\n"; } } } @@ -210,7 +192,6 @@ void perftest( const bool weighted, bool safe, std::size_t runs, - AggregationType aggregate_by, bool summary_only ) { auto const& index = *index_ptr; @@ -371,7 +352,7 @@ void perftest( break; } extract_times( - query_fun, queries, thresholds, type, t, runs, k, safe, aggregate_by, summary_only, std::cout + query_fun, queries, thresholds, type, t, runs, k, safe, summary_only, std::cout ); } } @@ -385,7 +366,6 @@ int main(int argc, const char** argv) { bool quantized = false; bool summary_only = false; std::size_t runs = 0; - AggregationType aggregate_by = AggregationType::None; App, @@ -402,19 +382,6 @@ int main(int argc, const char** argv) { "--summary-only", summary_only, "Only print summary stats, ommiting per-query results" ); app.add_option("--runs", runs, "Number of runs per query")->default_val(3)->check(CLI::PositiveNumber); - app.add_option("--aggregate-by", aggregate_by, "Aggregation mode for results per query") - ->transform( - CLI::CheckedTransformer( - std::map{ - {"none", AggregationType::None}, - {"min", AggregationType::Min}, - {"mean", AggregationType::Mean}, - {"median", AggregationType::Median}, - {"max", AggregationType::Max}, - } - ) - ) - ->default_val("none"); CLI11_PARSE(app, argc, argv); spdlog::set_default_logger(spdlog::stderr_color_mt("stderr")); @@ -435,7 +402,6 @@ int main(int argc, const char** argv) { app.weighted(), safe, runs, - aggregate_by, summary_only ); if (app.is_wand_compressed()) { From 852c619832b7c339112c9e754e578bff8787a873 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Thu, 25 Dec 2025 23:44:15 -0300 Subject: [PATCH 22/55] Remove `--summary-only` option --- tools/queries.cpp | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index f5a37fae..3b0d46f6 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -121,7 +121,6 @@ void 
extract_times( size_t runs, std::uint64_t k, bool safe, - bool summary_only, std::ostream& os ) { std::vector> times_per_query( @@ -163,19 +162,17 @@ void extract_times( print_aggregated_stats(AggregationType::Median); print_aggregated_stats(AggregationType::Max); - if (!summary_only) { - std::cout << "qid"; - for (size_t i = 1; i <= runs; ++i) { - std::cout << fmt::format("\tusec{}", i); - } - std::cout << "\n"; - for (auto&& [query_idx, query]: enumerate(queries)) { - os << fmt::format("{}", query.id().value_or(std::to_string(query_idx))); - for (auto t: times_per_query[query_idx]) { - os << fmt::format("\t{}", t); - } - os << "\n"; + std::cout << "qid"; + for (size_t i = 1; i <= runs; ++i) { + std::cout << fmt::format("\tusec{}", i); + } + std::cout << "\n"; + for (auto&& [query_idx, query]: enumerate(queries)) { + os << fmt::format("{}", query.id().value_or(std::to_string(query_idx))); + for (auto t: times_per_query[query_idx]) { + os << fmt::format("\t{}", t); } + os << "\n"; } } @@ -191,8 +188,7 @@ void perftest( const ScorerParams& scorer_params, const bool weighted, bool safe, - std::size_t runs, - bool summary_only + std::size_t runs ) { auto const& index = *index_ptr; @@ -352,7 +348,7 @@ void perftest( break; } extract_times( - query_fun, queries, thresholds, type, t, runs, k, safe, summary_only, std::cout + query_fun, queries, thresholds, type, t, runs, k, safe, std::cout ); } } @@ -364,7 +360,6 @@ using wand_uniform_index_quantized = wand_dataneeds(app.thresholds_option()); - app.add_flag( - "--summary-only", summary_only, "Only print summary stats, ommiting per-query results" - ); app.add_option("--runs", runs, "Number of runs per query")->default_val(3)->check(CLI::PositiveNumber); CLI11_PARSE(app, argc, argv); @@ -401,8 +393,7 @@ int main(int argc, const char** argv) { app.scorer_params(), app.weighted(), safe, - runs, - summary_only + runs ); if (app.is_wand_compressed()) { if (quantized) { From 5077c6774e5c535322f43559e9259b2179ab021f Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Fri, 26 Dec 2025 00:15:43 -0300 Subject: [PATCH 23/55] Add `--output` option, for query timing data --- tools/queries.cpp | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 3b0d46f6..11a162c7 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -121,7 +122,7 @@ void extract_times( size_t runs, std::uint64_t k, bool safe, - std::ostream& os + std::ostream* os = nullptr ) { std::vector> times_per_query( queries.size(), std::vector(runs) @@ -162,17 +163,19 @@ void extract_times( print_aggregated_stats(AggregationType::Median); print_aggregated_stats(AggregationType::Max); - std::cout << "qid"; - for (size_t i = 1; i <= runs; ++i) { - std::cout << fmt::format("\tusec{}", i); - } - std::cout << "\n"; - for (auto&& [query_idx, query]: enumerate(queries)) { - os << fmt::format("{}", query.id().value_or(std::to_string(query_idx))); - for (auto t: times_per_query[query_idx]) { - os << fmt::format("\t{}", t); + if (os != nullptr) { + *os << "qid"; + for (size_t i = 1; i <= runs; ++i) { + *os << fmt::format("\tusec{}", i); + } + *os << "\n"; + for (auto&& [query_idx, query]: enumerate(queries)) { + *os << fmt::format("{}", query.id().value_or(std::to_string(query_idx))); + for (auto t: times_per_query[query_idx]) { + *os << fmt::format("\t{}", t); + } + *os << "\n"; } - os << "\n"; } } @@ -188,7 +191,8 @@ void perftest( const ScorerParams& scorer_params, const bool weighted, bool safe, - std::size_t runs + std::size_t runs, + std::ostream* output ) { auto const& index = *index_ptr; @@ -348,7 +352,7 @@ void perftest( break; } extract_times( - query_fun, queries, thresholds, type, t, runs, k, safe, std::cout + query_fun, queries, thresholds, type, t, runs, k, safe, output ); } } @@ -361,6 +365,7 @@ int main(int argc, const char** argv) { 
bool safe = false; bool quantized = false; std::size_t runs = 0; + std::optional output_filename; App, @@ -374,11 +379,23 @@ int main(int argc, const char** argv) { app.add_flag("--safe", safe, "Rerun if not enough results with pruning.") ->needs(app.thresholds_option()); app.add_option("--runs", runs, "Number of runs per query")->default_val(3)->check(CLI::PositiveNumber); + app.add_option("-o,--output", output_filename, "Output file for query timing data"); CLI11_PARSE(app, argc, argv); spdlog::set_default_logger(spdlog::stderr_color_mt("stderr")); spdlog::set_level(app.log_level()); + std::ofstream output_file; + std::ostream* output = nullptr; + if (output_filename) { + output_file.open(*output_filename); + if (!output_file) { + spdlog::error("Failed to open data output file: {}", *output_filename); + return 1; + } + output = &output_file; + } + run_for_index( app.index_encoding(), MemorySource::mapped_file(app.index_filename()), [&](auto index) { using Index = std::decay_t; @@ -393,7 +410,8 @@ int main(int argc, const char** argv) { app.scorer_params(), app.weighted(), safe, - runs + runs, + output ); if (app.is_wand_compressed()) { if (quantized) { From a709b671dd342b60605d2a268577d575eb45f91d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Fri, 26 Dec 2025 16:42:55 -0300 Subject: [PATCH 24/55] Improve JSON output and logs --- tools/queries.cpp | 67 +++++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 11a162c7..2559ea1c 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -102,16 +102,6 @@ std::vector aggregate_and_sort_times_per_query( return aggregated_query_times; } -void print_stats(AggregationType aggregation_type, std::vector const& query_times) { - double mean = - std::accumulate(query_times.begin(), query_times.end(), double()) / query_times.size(); - double q50 = query_times[query_times.size() / 2]; - double q90 = 
query_times[90 * query_times.size() / 100]; - double q95 = query_times[95 * query_times.size() / 100]; - double q99 = query_times[99 * query_times.size() / 100]; - stats_line(std::cerr)("aggregated_by", to_string(aggregation_type))("mean", mean)("q50", q50)("q90", q90)("q95", q95)("q99", q99); -} - template void extract_times( Fn query_func, @@ -128,7 +118,6 @@ void extract_times( queries.size(), std::vector(runs) ); std::size_t corrective_rerun_count = 0; - spdlog::info("Safe: {}", safe); // Note: each query is measured once per run, so the set of queries is // measured independently in each run. @@ -150,19 +139,46 @@ void extract_times( } } - spdlog::info("---- {} {}", index_type, query_type); - spdlog::info("Corrective reruns due to insufficient results: {}", corrective_rerun_count); - spdlog::info("Runs per query (excluding warmup): {}", runs); + // Print JSON summary + std::cout << "{\n" + << " \"encoding\": \"" << index_type << "\",\n" + << " \"algorithm\": \"" << query_type << "\",\n" + << " \"runs\": " << runs << ",\n" + << " \"k\": " << k << ",\n" + << " \"safe\": " << (safe ? 
"true" : "false") << ",\n" + << " \"corrective_reruns\": " << corrective_rerun_count << ",\n" + << " \"query_aggregation\": {\n"; + + auto print_aggregated_query_times = [&](AggregationType agg_type, bool is_last=false) { + auto query_times = aggregate_and_sort_times_per_query(agg_type, times_per_query); + auto agg_name = to_string(agg_type); - auto print_aggregated_stats = [&](AggregationType type) { - print_stats(type, aggregate_and_sort_times_per_query(type, times_per_query)); + double mean = + std::accumulate(query_times.begin(), query_times.end(), double()) / query_times.size(); + double q50 = query_times[query_times.size() / 2]; + double q90 = query_times[90 * query_times.size() / 100]; + double q95 = query_times[95 * query_times.size() / 100]; + double q99 = query_times[99 * query_times.size() / 100]; + + std::cout << " \"" << agg_name << "\": {" + << "\"mean\": " << mean << ", " + << "\"q50\": " << q50 << ", " + << "\"q90\": " << q90 << ", " + << "\"q95\": " << q95 << ", " + << "\"q99\": " << q99 << "}"; + if (!is_last) { + std::cout << ",\n"; + } }; - print_aggregated_stats(AggregationType::None); - print_aggregated_stats(AggregationType::Min); - print_aggregated_stats(AggregationType::Mean); - print_aggregated_stats(AggregationType::Median); - print_aggregated_stats(AggregationType::Max); + print_aggregated_query_times(AggregationType::None); + print_aggregated_query_times(AggregationType::Min); + print_aggregated_query_times(AggregationType::Mean); + print_aggregated_query_times(AggregationType::Median); + print_aggregated_query_times(AggregationType::Max, true); + std::cout << "\n }\n}\n"; + + // Save times per query (if required) if (os != nullptr) { *os << "qid"; for (size_t i = 1; i <= runs; ++i) { @@ -195,8 +211,7 @@ void perftest( std::ostream* output ) { auto const& index = *index_ptr; - - spdlog::info("Warming up posting lists"); + spdlog::info("Warming up posting lists..."); std::unordered_set warmed_up; for (auto const& q: queries) { for (auto 
[t, _]: q.terms()) { @@ -229,15 +244,11 @@ void perftest( } auto scorer = scorer::from_params(scorer_params, wdata); - - spdlog::info("Performing {} queries", type); - spdlog::info("K: {}", k); - std::vector query_types; boost::algorithm::split(query_types, query_type, boost::is_any_of(":")); for (auto&& t: query_types) { - spdlog::info("Query type: {}", t); + spdlog::info("Performing {} runs for '{}' queries...", runs, t); std::function query_fun; if (t == "and") { query_fun = [&](Query query, Score) { From da11b0065b71b4f4f5c8fe922a98f49e776fbe5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Fri, 26 Dec 2025 16:53:36 -0300 Subject: [PATCH 25/55] Use `std::logic_error` instead of just `return` in `to_string` --- tools/queries.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 2559ea1c..89b3bbcd 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -55,7 +55,7 @@ enum class AggregationType { None = 0, Min = 1, Mean = 2, Median = 3, Max = 4 }; case AggregationType::Median: return "median"; case AggregationType::Max: return "max"; } - return "unknown"; + throw std::logic_error("Unknown AggregationType"); } std::vector aggregate_and_sort_times_per_query( From b866216c3f897951a7045b8653db3235778fce42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Fri, 26 Dec 2025 18:24:43 -0300 Subject: [PATCH 26/55] Add support for specifying multiple output files (one per specified query type) --- tools/queries.cpp | 63 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 16 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 89b3bbcd..768297a7 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -202,13 +202,13 @@ void perftest( const std::vector& queries, const std::optional& thresholds_filename, std::string const& type, - std::string const& query_type, + std::vector const& query_types, uint64_t k, 
const ScorerParams& scorer_params, const bool weighted, bool safe, std::size_t runs, - std::ostream* output + std::optional> const& output_paths ) { auto const& index = *index_ptr; spdlog::info("Warming up posting lists..."); @@ -244,11 +244,29 @@ void perftest( } auto scorer = scorer::from_params(scorer_params, wdata); - std::vector query_types; - boost::algorithm::split(query_types, query_type, boost::is_any_of(":")); - for (auto&& t: query_types) { + std::vector output_files(output_paths ? output_paths->size() : 0); + if (output_paths) { + for (std::size_t i = 0; i < output_paths->size(); ++i) { + const auto& path = (*output_paths)[i]; + output_files[i].open(path); + if (!output_files[i].is_open()) { + const auto err_msg = fmt::format("Failed to open output file: {}.", path); + spdlog::error(err_msg); + throw std::runtime_error(err_msg); + } + } + } + + for (std::size_t query_type_idx = 0; query_type_idx < query_types.size(); ++query_type_idx) { + auto const& t = query_types[query_type_idx]; + std::ostream* output = nullptr; + if (output_paths) { + output = &output_files[query_type_idx]; + spdlog::info("Per-run query output will be saved to '{}'.", (*output_paths)[query_type_idx]); + } spdlog::info("Performing {} runs for '{}' queries...", runs, t); + std::function query_fun; if (t == "and") { query_fun = [&](Query query, Score) { @@ -376,7 +394,7 @@ int main(int argc, const char** argv) { bool safe = false; bool quantized = false; std::size_t runs = 0; - std::optional output_filename; + std::optional output_paths_arg; App, @@ -390,21 +408,34 @@ int main(int argc, const char** argv) { app.add_flag("--safe", safe, "Rerun if not enough results with pruning.") ->needs(app.thresholds_option()); app.add_option("--runs", runs, "Number of runs per query")->default_val(3)->check(CLI::PositiveNumber); - app.add_option("-o,--output", output_filename, "Output file for query timing data"); + app.add_option( + "-o,--output", + output_paths_arg, + "Output file for per-run 
query timing data (use ':' to separate multiple files)" + ); CLI11_PARSE(app, argc, argv); spdlog::set_default_logger(spdlog::stderr_color_mt("stderr")); spdlog::set_level(app.log_level()); - std::ofstream output_file; - std::ostream* output = nullptr; - if (output_filename) { - output_file.open(*output_filename); - if (!output_file) { - spdlog::error("Failed to open data output file: {}", *output_filename); + // Parse query types (algorithms) + std::vector query_types; + boost::algorithm::split(query_types, app.algorithm(), boost::is_any_of(":")); + + // Parse file paths + std::optional> output_paths; + if (output_paths_arg) { + std::vector outputs; + boost::algorithm::split(outputs, *output_paths_arg, boost::is_any_of(":")); + if (outputs.size() != query_types.size()) { + spdlog::error( + "Expected {} output files (one per query type) but got {}.", + outputs.size(), + query_types.size() + ); return 1; } - output = &output_file; + output_paths = std::move(outputs); } run_for_index( @@ -416,13 +447,13 @@ int main(int argc, const char** argv) { app.queries(), app.thresholds_file(), app.index_encoding(), - app.algorithm(), + query_types, app.k(), app.scorer_params(), app.weighted(), safe, runs, - output + output_paths ); if (app.is_wand_compressed()) { if (quantized) { From 9ef2849f0e889615426eaf6fc1885dda0dc335c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Fri, 26 Dec 2025 18:26:45 -0300 Subject: [PATCH 27/55] Add clarification for query type (algorithm) parameter --- tools/app.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/app.cpp b/tools/app.cpp index dc2ca310..e12b07a6 100644 --- a/tools/app.cpp +++ b/tools/app.cpp @@ -87,7 +87,12 @@ const std::map LogLevel::ENUM_MAP = { }; Algorithm::Algorithm(CLI::App* app) { - app->add_option("-a,--algorithm", m_algorithm, "Query processing algorithm")->required(); + app->add_option( + "-a,--algorithm", + m_algorithm, + "Query processing algorithm (use ':' to 
separate multiple algorithms)" + ) + ->required(); } auto Algorithm::algorithm() const -> std::string const& { From 5c0fceba647039c0607c1569353d471b5a1bb7d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Fri, 26 Dec 2025 23:58:36 -0300 Subject: [PATCH 28/55] Modify query timing output to use a single file with 'algorithm' column --- tools/queries.cpp | 77 +++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 49 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 768297a7..f1a1a602 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -180,13 +180,12 @@ void extract_times( // Save times per query (if required) if (os != nullptr) { - *os << "qid"; - for (size_t i = 1; i <= runs; ++i) { - *os << fmt::format("\tusec{}", i); - } - *os << "\n"; for (auto&& [query_idx, query]: enumerate(queries)) { - *os << fmt::format("{}", query.id().value_or(std::to_string(query_idx))); + *os << fmt::format( + "{}\t{}", + query_type, + query.id().value_or(std::to_string(query_idx)) + ); for (auto t: times_per_query[query_idx]) { *os << fmt::format("\t{}", t); } @@ -208,7 +207,7 @@ void perftest( const bool weighted, bool safe, std::size_t runs, - std::optional> const& output_paths + std::optional const& output_path ) { auto const& index = *index_ptr; spdlog::info("Warming up posting lists..."); @@ -245,26 +244,28 @@ void perftest( auto scorer = scorer::from_params(scorer_params, wdata); - std::vector output_files(output_paths ? 
output_paths->size() : 0); - if (output_paths) { - for (std::size_t i = 0; i < output_paths->size(); ++i) { - const auto& path = (*output_paths)[i]; - output_files[i].open(path); - if (!output_files[i].is_open()) { - const auto err_msg = fmt::format("Failed to open output file: {}.", path); - spdlog::error(err_msg); - throw std::runtime_error(err_msg); - } + std::ofstream output_file; + std::ostream* output = nullptr; + if (output_path) { + output_file.open(*output_path); + if (!output_file.is_open()) { + const auto err_msg = fmt::format("Failed to open output file: {}.", *output_path); + spdlog::error(err_msg); + throw std::runtime_error(err_msg); } - } + output = &output_file; + + // Add header + output_file << "algorithm\tqid"; + for (size_t i = 1; i <= runs; ++i) { + output_file << fmt::format("\tusec{}", i); + } + output_file << "\n"; + spdlog::info("Per-run query output will be saved to '{}'.", *output_path); + } for (std::size_t query_type_idx = 0; query_type_idx < query_types.size(); ++query_type_idx) { auto const& t = query_types[query_type_idx]; - std::ostream* output = nullptr; - if (output_paths) { - output = &output_files[query_type_idx]; - spdlog::info("Per-run query output will be saved to '{}'.", (*output_paths)[query_type_idx]); - } spdlog::info("Performing {} runs for '{}' queries...", runs, t); std::function query_fun; @@ -380,9 +381,7 @@ void perftest( spdlog::error("Unsupported query type: {}", t); break; } - extract_times( - query_fun, queries, thresholds, type, t, runs, k, safe, output - ); + extract_times(query_fun, queries, thresholds, type, t, runs, k, safe, output); } } @@ -394,7 +393,7 @@ int main(int argc, const char** argv) { bool safe = false; bool quantized = false; std::size_t runs = 0; - std::optional output_paths_arg; + std::optional output_path; App, @@ -408,11 +407,7 @@ int main(int argc, const char** argv) { app.add_flag("--safe", safe, "Rerun if not enough results with pruning.") ->needs(app.thresholds_option()); 
app.add_option("--runs", runs, "Number of runs per query")->default_val(3)->check(CLI::PositiveNumber); - app.add_option( - "-o,--output", - output_paths_arg, - "Output file for per-run query timing data (use ':' to separate multiple files)" - ); + app.add_option("-o,--output", output_path, "Output file for per-run query timing data"); CLI11_PARSE(app, argc, argv); spdlog::set_default_logger(spdlog::stderr_color_mt("stderr")); @@ -422,22 +417,6 @@ int main(int argc, const char** argv) { std::vector query_types; boost::algorithm::split(query_types, app.algorithm(), boost::is_any_of(":")); - // Parse file paths - std::optional> output_paths; - if (output_paths_arg) { - std::vector outputs; - boost::algorithm::split(outputs, *output_paths_arg, boost::is_any_of(":")); - if (outputs.size() != query_types.size()) { - spdlog::error( - "Expected {} output files (one per query type) but got {}.", - outputs.size(), - query_types.size() - ); - return 1; - } - output_paths = std::move(outputs); - } - run_for_index( app.index_encoding(), MemorySource::mapped_file(app.index_filename()), [&](auto index) { using Index = std::decay_t; @@ -453,7 +432,7 @@ int main(int argc, const char** argv) { app.weighted(), safe, runs, - output_paths + output_path ); if (app.is_wand_compressed()) { if (quantized) { From 5b14aecf81a52d6178dc31a40e5e98ebe1b9d91e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Sat, 27 Dec 2025 00:44:26 -0300 Subject: [PATCH 29/55] Update JSON output and use `nlohmann/json.hpp` library --- tools/CMakeLists.txt | 2 +- tools/queries.cpp | 50 ++++++++++++++++++++++---------------------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 071f7077..6a6a2229 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -5,7 +5,7 @@ function(ADD_TOOL TOOL_NAME SRC_FILE) endfunction() add_library(app app.cpp) -target_link_libraries(app pisa CLI11) +target_link_libraries(app pisa 
CLI11 nlohmann_json::nlohmann_json) add_tool(compress_inverted_index compress_inverted_index.cpp) add_tool(create_wand_data create_wand_data.cpp) diff --git a/tools/queries.cpp b/tools/queries.cpp index f1a1a602..537e5ec4 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -5,6 +5,8 @@ #include #include +#include + #include #include #include @@ -44,7 +46,6 @@ using namespace pisa; using ranges::views::enumerate; - enum class AggregationType { None = 0, Min = 1, Mean = 2, Median = 3, Max = 4 }; [[nodiscard]] auto to_string(AggregationType type) -> std::string { @@ -140,16 +141,16 @@ void extract_times( } // Print JSON summary - std::cout << "{\n" - << " \"encoding\": \"" << index_type << "\",\n" - << " \"algorithm\": \"" << query_type << "\",\n" - << " \"runs\": " << runs << ",\n" - << " \"k\": " << k << ",\n" - << " \"safe\": " << (safe ? "true" : "false") << ",\n" - << " \"corrective_reruns\": " << corrective_rerun_count << ",\n" - << " \"query_aggregation\": {\n"; + nlohmann::json summary; + summary["encoding"] = index_type; + summary["algorithm"] = query_type; + summary["runs"] = runs; + summary["k"] = k; + summary["safe"] = safe; + summary["corrective_reruns"] = corrective_rerun_count; + summary["times"] = nlohmann::json::array(); - auto print_aggregated_query_times = [&](AggregationType agg_type, bool is_last=false) { + auto add_aggregated_query_times = [&](AggregationType agg_type) { auto query_times = aggregate_and_sort_times_per_query(agg_type, times_per_query); auto agg_name = to_string(agg_type); @@ -160,23 +161,22 @@ void extract_times( double q95 = query_times[95 * query_times.size() / 100]; double q99 = query_times[99 * query_times.size() / 100]; - std::cout << " \"" << agg_name << "\": {" - << "\"mean\": " << mean << ", " - << "\"q50\": " << q50 << ", " - << "\"q90\": " << q90 << ", " - << "\"q95\": " << q95 << ", " - << "\"q99\": " << q99 << "}"; - if (!is_last) { - std::cout << ",\n"; - } + summary["times"].push_back({ + {"query_aggregation", 
agg_name}, + {"mean", mean}, + {"q50", q50}, + {"q90", q90}, + {"q95", q95}, + {"q99", q99} + }); }; - print_aggregated_query_times(AggregationType::None); - print_aggregated_query_times(AggregationType::Min); - print_aggregated_query_times(AggregationType::Mean); - print_aggregated_query_times(AggregationType::Median); - print_aggregated_query_times(AggregationType::Max, true); - std::cout << "\n }\n}\n"; + add_aggregated_query_times(AggregationType::None); + add_aggregated_query_times(AggregationType::Min); + add_aggregated_query_times(AggregationType::Mean); + add_aggregated_query_times(AggregationType::Median); + add_aggregated_query_times(AggregationType::Max); + std::cout << summary.dump(2) << "\n"; // Save times per query (if required) if (os != nullptr) { From 4a5845510541480bfe081ce30fdec33c8c426305 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Sat, 27 Dec 2025 01:45:36 -0300 Subject: [PATCH 30/55] Format `queries.cpp` --- tools/queries.cpp | 76 ++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 40 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 537e5ec4..376ac8b6 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -71,7 +71,8 @@ std::vector aggregate_and_sort_times_per_query( } } else if (aggregation_type == AggregationType::Min) { for (auto const& query_times: times_per_query) { - aggregated_query_times.push_back(*std::min_element(query_times.begin(), query_times.end())); + aggregated_query_times.push_back(*std::min_element(query_times.begin(), query_times.end()) + ); } } else if (aggregation_type == AggregationType::Mean) { for (auto const& query_times: times_per_query) { @@ -96,7 +97,8 @@ std::vector aggregate_and_sort_times_per_query( } } else if (aggregation_type == AggregationType::Max) { for (auto const& query_times: times_per_query) { - aggregated_query_times.push_back(*std::max_element(query_times.begin(), query_times.end())); + 
aggregated_query_times.push_back(*std::max_element(query_times.begin(), query_times.end()) + ); } } std::sort(aggregated_query_times.begin(), aggregated_query_times.end()); @@ -161,14 +163,14 @@ void extract_times( double q95 = query_times[95 * query_times.size() / 100]; double q99 = query_times[99 * query_times.size() / 100]; - summary["times"].push_back({ - {"query_aggregation", agg_name}, - {"mean", mean}, - {"q50", q50}, - {"q90", q90}, - {"q95", q95}, - {"q99", q99} - }); + summary["times"].push_back( + {{"query_aggregation", agg_name}, + {"mean", mean}, + {"q50", q50}, + {"q90", q90}, + {"q95", q95}, + {"q99", q99}} + ); }; add_aggregated_query_times(AggregationType::None); @@ -181,11 +183,7 @@ void extract_times( // Save times per query (if required) if (os != nullptr) { for (auto&& [query_idx, query]: enumerate(queries)) { - *os << fmt::format( - "{}\t{}", - query_type, - query.id().value_or(std::to_string(query_idx)) - ); + *os << fmt::format("{}\t{}", query_type, query.id().value_or(std::to_string(query_idx))); for (auto t: times_per_query[query_idx]) { *os << fmt::format("\t{}", t); } @@ -417,32 +415,30 @@ int main(int argc, const char** argv) { std::vector query_types; boost::algorithm::split(query_types, app.algorithm(), boost::is_any_of(":")); - run_for_index( - app.index_encoding(), MemorySource::mapped_file(app.index_filename()), [&](auto index) { - using Index = std::decay_t; - auto params = std::make_tuple( - &index, - app.wand_data_path(), - app.queries(), - app.thresholds_file(), - app.index_encoding(), - query_types, - app.k(), - app.scorer_params(), - app.weighted(), - safe, - runs, - output_path - ); - if (app.is_wand_compressed()) { - if (quantized) { - std::apply(perftest, params); - } else { - std::apply(perftest, params); - } + run_for_index(app.index_encoding(), MemorySource::mapped_file(app.index_filename()), [&](auto index) { + using Index = std::decay_t; + auto params = std::make_tuple( + &index, + app.wand_data_path(), + 
app.queries(), + app.thresholds_file(), + app.index_encoding(), + query_types, + app.k(), + app.scorer_params(), + app.weighted(), + safe, + runs, + output_path + ); + if (app.is_wand_compressed()) { + if (quantized) { + std::apply(perftest, params); } else { - std::apply(perftest, params); + std::apply(perftest, params); } + } else { + std::apply(perftest, params); } - ); + }); } From 5c702fdc036eecc907d961ea747053bd5c3d3036 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Fri, 2 Jan 2026 17:21:47 -0300 Subject: [PATCH 31/55] Add license comment --- tools/queries.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/queries.cpp b/tools/queries.cpp index 376ac8b6..fe749065 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -1,3 +1,17 @@ +// Copyright 2025 PISA Developers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ #include #include #include From c347fb365d18c4ea32c7f3b3d34dd08e20c39609 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Fri, 2 Jan 2026 17:28:22 -0300 Subject: [PATCH 32/55] Reorder `` include directive --- tools/queries.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index fe749065..c4945b84 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -19,11 +19,10 @@ #include #include -#include - #include #include #include +#include #include #include #include From 2722a47a64af7b2a7dfe3f6f8cf92bc9d0b8e7b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Fri, 2 Jan 2026 17:54:25 -0300 Subject: [PATCH 33/55] Refactor query loop using `enumerate` --- tools/queries.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index c4945b84..9e1bc4b9 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -138,8 +138,7 @@ void extract_times( // Note: each query is measured once per run, so the set of queries is // measured independently in each run. 
for (size_t run = 0; run <= runs; ++run) { - size_t query_idx = 0; - for (auto const& query: queries) { + for (auto&& [query_idx, query]: enumerate(queries)) { auto usecs = run_with_timer([&]() { uint64_t result = query_func(query, thresholds[query_idx]); if (safe && result < k) { @@ -151,7 +150,6 @@ void extract_times( if (run != 0) { // first run is not timed times_per_query[query_idx][run - 1] = usecs.count(); } - query_idx += 1; } } From a9028dc2375e6623af37001c0885c2c1ba287b5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Mon, 5 Jan 2026 16:48:53 -0300 Subject: [PATCH 34/55] Refactor query times structs --- tools/queries.cpp | 71 ++++++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 28 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 9e1bc4b9..2c11a499 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -61,6 +61,7 @@ using namespace pisa; using ranges::views::enumerate; enum class AggregationType { None = 0, Min = 1, Mean = 2, Median = 3, Max = 4 }; + [[nodiscard]] auto to_string(AggregationType type) -> std::string { switch (type) { case AggregationType::None: return "none"; @@ -72,9 +73,28 @@ enum class AggregationType { None = 0, Min = 1, Mean = 2, Median = 3, Max = 4 }; throw std::logic_error("Unknown AggregationType"); } -std::vector aggregate_and_sort_times_per_query( +struct QueryTimesSummary { + AggregationType aggregation_type; + double mean; + double q50; + double q90; + double q95; + double q99; + + [[nodiscard]] auto to_json() const -> nlohmann::json + { + return {{"query_aggregation", to_string(aggregation_type)}, + {"mean", mean}, + {"q50", q50}, + {"q90", q90}, + {"q95", q95}, + {"q99", q99}}; + } +}; + +auto aggregate_query_times( AggregationType aggregation_type, std::vector> const& times_per_query -) { +) -> std::vector { std::vector aggregated_query_times; if (aggregation_type == AggregationType::None) { for (auto const& query_times: times_per_query) { @@ 
-118,6 +138,22 @@ std::vector aggregate_and_sort_times_per_query( return aggregated_query_times; } +auto summarize( + std::vector> const& times_per_query, AggregationType agg_type +) -> QueryTimesSummary +{ + auto query_times = aggregate_query_times(agg_type, times_per_query); + + double mean = + std::accumulate(query_times.begin(), query_times.end(), double()) / query_times.size(); + double q50 = query_times[query_times.size() / 2]; + double q90 = query_times[90 * query_times.size() / 100]; + double q95 = query_times[95 * query_times.size() / 100]; + double q99 = query_times[99 * query_times.size() / 100]; + + return {agg_type, mean, q50, q90, q95, q99}; +} + template void extract_times( Fn query_func, @@ -163,32 +199,11 @@ void extract_times( summary["corrective_reruns"] = corrective_rerun_count; summary["times"] = nlohmann::json::array(); - auto add_aggregated_query_times = [&](AggregationType agg_type) { - auto query_times = aggregate_and_sort_times_per_query(agg_type, times_per_query); - auto agg_name = to_string(agg_type); - - double mean = - std::accumulate(query_times.begin(), query_times.end(), double()) / query_times.size(); - double q50 = query_times[query_times.size() / 2]; - double q90 = query_times[90 * query_times.size() / 100]; - double q95 = query_times[95 * query_times.size() / 100]; - double q99 = query_times[99 * query_times.size() / 100]; - - summary["times"].push_back( - {{"query_aggregation", agg_name}, - {"mean", mean}, - {"q50", q50}, - {"q90", q90}, - {"q95", q95}, - {"q99", q99}} - ); - }; - - add_aggregated_query_times(AggregationType::None); - add_aggregated_query_times(AggregationType::Min); - add_aggregated_query_times(AggregationType::Mean); - add_aggregated_query_times(AggregationType::Median); - add_aggregated_query_times(AggregationType::Max); + summary["times"].push_back(summarize(times_per_query, AggregationType::None).to_json()); + summary["times"].push_back(summarize(times_per_query, AggregationType::Min).to_json()); + 
summary["times"].push_back(summarize(times_per_query, AggregationType::Mean).to_json()); + summary["times"].push_back(summarize(times_per_query, AggregationType::Median).to_json()); + summary["times"].push_back(summarize(times_per_query, AggregationType::Max).to_json()); std::cout << summary.dump(2) << "\n"; // Save times per query (if required) From aa0bcede1a3958a01a779e4c3ef32720b579c8ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Mon, 5 Jan 2026 17:06:28 -0300 Subject: [PATCH 35/55] Change query timing output to 'long' format --- tools/queries.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 2c11a499..5aa04368 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -209,11 +209,15 @@ void extract_times( // Save times per query (if required) if (os != nullptr) { for (auto&& [query_idx, query]: enumerate(queries)) { - *os << fmt::format("{}\t{}", query_type, query.id().value_or(std::to_string(query_idx))); - for (auto t: times_per_query[query_idx]) { - *os << fmt::format("\t{}", t); + for (auto&& [run_idx, time]: enumerate(times_per_query[query_idx])) { + *os << fmt::format( + "{}\t{}\t{}\t{}\n", + query_type, + query.id().value_or(std::to_string(query_idx)), + run_idx + 1, + time + ); } - *os << "\n"; } } } @@ -280,11 +284,7 @@ void perftest( output = &output_file; // Add header - output_file << "algorithm\tqid"; - for (size_t i = 1; i <= runs; ++i) { - output_file << fmt::format("\tusec{}", i); - } - output_file << "\n"; + output_file << "algorithm\tqid\trun\tusec\n"; spdlog::info("Per-run query output will be saved to '{}'.", *output_path); } From 17780d64ba65324462888e927472ff4dd2a7f881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Mon, 5 Jan 2026 18:38:36 -0300 Subject: [PATCH 36/55] Refactor query timing logic into structs --- tools/queries.cpp | 198 ++++++++++++++++++++++++++-------------------- 1 file 
changed, 112 insertions(+), 86 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 5aa04368..e2a8285f 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -92,84 +92,82 @@ struct QueryTimesSummary { } }; -auto aggregate_query_times( - AggregationType aggregation_type, std::vector> const& times_per_query -) -> std::vector { - std::vector aggregated_query_times; - if (aggregation_type == AggregationType::None) { - for (auto const& query_times: times_per_query) { - for (auto t: query_times) { - aggregated_query_times.push_back(t); +struct QueryTimes { + std::vector> values; + std::size_t corrective_rerun_count; + + auto aggregate(AggregationType aggregation_type) const -> std::vector + { + std::vector aggregated_query_times; + if (aggregation_type == AggregationType::None) { + for (auto const& times_per_run: values) { + for (auto t: times_per_run) { + aggregated_query_times.push_back(t); + } } - } - } else if (aggregation_type == AggregationType::Min) { - for (auto const& query_times: times_per_query) { - aggregated_query_times.push_back(*std::min_element(query_times.begin(), query_times.end()) - ); - } - } else if (aggregation_type == AggregationType::Mean) { - for (auto const& query_times: times_per_query) { - double sum = std::accumulate(query_times.begin(), query_times.end(), double()); - double mean = sum / query_times.size(); - aggregated_query_times.push_back(mean); - } - } else if (aggregation_type == AggregationType::Median) { - for (auto const& query_times: times_per_query) { - auto sorted_query_times = query_times; - std::sort(sorted_query_times.begin(), sorted_query_times.end()); - std::size_t sample_count = sorted_query_times.size(); - double median = 0; - if (sample_count % 2 == 1) { - median = sorted_query_times[sample_count / 2]; - } else { - median = - (sorted_query_times[sample_count / 2] + sorted_query_times[sample_count / 2 - 1]) - / 2; + } else if (aggregation_type == AggregationType::Min) { + for (auto const& times_per_run: 
values) { + aggregated_query_times.push_back(*std::min_element(times_per_run.begin(), times_per_run.end()) + ); + } + } else if (aggregation_type == AggregationType::Mean) { + for (auto const& times_per_run: values) { + double sum = std::accumulate(times_per_run.begin(), times_per_run.end(), double()); + double mean = sum / times_per_run.size(); + aggregated_query_times.push_back(mean); + } + } else if (aggregation_type == AggregationType::Median) { + for (auto const& times_per_run: values) { + auto sorted_times = times_per_run; + std::sort(sorted_times.begin(), sorted_times.end()); + std::size_t sample_count = sorted_times.size(); + double median = 0; + if (sample_count % 2 == 1) { + median = sorted_times[sample_count / 2]; + } else { + median = + (sorted_times[sample_count / 2] + sorted_times[sample_count / 2 - 1]) + / 2; + } + aggregated_query_times.push_back(median); + } + } else if (aggregation_type == AggregationType::Max) { + for (auto const& times_per_run: values) { + aggregated_query_times.push_back(*std::max_element(times_per_run.begin(), times_per_run.end()) + ); } - aggregated_query_times.push_back(median); - } - } else if (aggregation_type == AggregationType::Max) { - for (auto const& query_times: times_per_query) { - aggregated_query_times.push_back(*std::max_element(query_times.begin(), query_times.end()) - ); } + std::sort(aggregated_query_times.begin(), aggregated_query_times.end()); + return aggregated_query_times; } - std::sort(aggregated_query_times.begin(), aggregated_query_times.end()); - return aggregated_query_times; -} -auto summarize( - std::vector> const& times_per_query, AggregationType agg_type -) -> QueryTimesSummary -{ - auto query_times = aggregate_query_times(agg_type, times_per_query); + auto summarize(AggregationType agg_type) const -> QueryTimesSummary + { + auto aggregated_times = aggregate(agg_type); + + double mean = std::accumulate(aggregated_times.begin(), aggregated_times.end(), double()) + / aggregated_times.size(); + 
double q50 = aggregated_times[aggregated_times.size() / 2]; + double q90 = aggregated_times[90 * aggregated_times.size() / 100]; + double q95 = aggregated_times[95 * aggregated_times.size() / 100]; + double q99 = aggregated_times[99 * aggregated_times.size() / 100]; - double mean = - std::accumulate(query_times.begin(), query_times.end(), double()) / query_times.size(); - double q50 = query_times[query_times.size() / 2]; - double q90 = query_times[90 * query_times.size() / 100]; - double q95 = query_times[95 * query_times.size() / 100]; - double q99 = query_times[99 * query_times.size() / 100]; + return {agg_type, mean, q50, q90, q95, q99}; + } - return {agg_type, mean, q50, q90, q95, q99}; -} +}; template -void extract_times( +auto extract_times( Fn query_func, std::vector const& queries, std::vector const& thresholds, - std::string const& index_type, - std::string const& query_type, size_t runs, std::uint64_t k, - bool safe, - std::ostream* os = nullptr -) { - std::vector> times_per_query( - queries.size(), std::vector(runs) - ); - std::size_t corrective_rerun_count = 0; + bool safe +) -> QueryTimes { + QueryTimes query_times{ + std::vector>(queries.size(), std::vector(runs)), 0}; // Note: each query is measured once per run, so the set of queries is // measured independently in each run. 
@@ -178,46 +176,70 @@ void extract_times( auto usecs = run_with_timer([&]() { uint64_t result = query_func(query, thresholds[query_idx]); if (safe && result < k) { - corrective_rerun_count += 1; + query_times.corrective_rerun_count += 1; result = query_func(query, 0); } do_not_optimize_away(result); }); if (run != 0) { // first run is not timed - times_per_query[query_idx][run - 1] = usecs.count(); + query_times.values[query_idx][run - 1] = usecs.count(); } } } - // Print JSON summary + return query_times; +} + +void print_summary( + QueryTimes const& query_times, + std::string const& index_type, + std::string const& query_type, + size_t runs, + std::uint64_t k, + bool safe +) { nlohmann::json summary; summary["encoding"] = index_type; summary["algorithm"] = query_type; summary["runs"] = runs; summary["k"] = k; summary["safe"] = safe; - summary["corrective_reruns"] = corrective_rerun_count; + summary["corrective_reruns"] = query_times.corrective_rerun_count; summary["times"] = nlohmann::json::array(); - summary["times"].push_back(summarize(times_per_query, AggregationType::None).to_json()); - summary["times"].push_back(summarize(times_per_query, AggregationType::Min).to_json()); - summary["times"].push_back(summarize(times_per_query, AggregationType::Mean).to_json()); - summary["times"].push_back(summarize(times_per_query, AggregationType::Median).to_json()); - summary["times"].push_back(summarize(times_per_query, AggregationType::Max).to_json()); + summary["times"].push_back( + query_times.summarize(AggregationType::None).to_json() + ); + summary["times"].push_back( + query_times.summarize(AggregationType::Min).to_json() + ); + summary["times"].push_back( + query_times.summarize(AggregationType::Mean).to_json() + ); + summary["times"].push_back( + query_times.summarize(AggregationType::Median).to_json() + ); + summary["times"].push_back( + query_times.summarize(AggregationType::Max).to_json() + ); std::cout << summary.dump(2) << "\n"; +} - // Save times per query 
(if required) - if (os != nullptr) { - for (auto&& [query_idx, query]: enumerate(queries)) { - for (auto&& [run_idx, time]: enumerate(times_per_query[query_idx])) { - *os << fmt::format( - "{}\t{}\t{}\t{}\n", - query_type, - query.id().value_or(std::to_string(query_idx)), - run_idx + 1, - time - ); - } +void print_times( + QueryTimes const& query_times, + std::vector const& queries, + std::string const& query_type, + std::ostream& os +) { + for (auto&& [query_idx, query]: enumerate(queries)) { + for (auto&& [run_idx, time]: enumerate(query_times.values[query_idx])) { + os << fmt::format( + "{}\t{}\t{}\t{}\n", + query_type, + query.id().value_or(std::to_string(query_idx)), + run_idx + 1, + time + ); } } } @@ -405,7 +427,11 @@ void perftest( spdlog::error("Unsupported query type: {}", t); break; } - extract_times(query_fun, queries, thresholds, type, t, runs, k, safe, output); + auto query_times = extract_times(query_fun, queries, thresholds, runs, k, safe); + print_summary(query_times, type, t, runs, k, safe); + if (output != nullptr) { + print_times(query_times, queries, t, *output); + } } } From 0ff184a18a38156d19bf0070733e615ead2667b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Mon, 5 Jan 2026 18:48:52 -0300 Subject: [PATCH 37/55] Move `to_string` in `AggregationType` --- tools/queries.cpp | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index e2a8285f..dd77cae4 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -59,19 +59,28 @@ using namespace pisa; using ranges::views::enumerate; -enum class AggregationType { None = 0, Min = 1, Mean = 2, Median = 3, Max = 4 }; +class AggregationType { +public: + enum Value { None = 0, Min = 1, Mean = 2, Median = 3, Max = 4 }; -[[nodiscard]] auto to_string(AggregationType type) -> std::string { - switch (type) { - case AggregationType::None: return "none"; - case AggregationType::Min: return "min"; - 
case AggregationType::Mean: return "mean"; - case AggregationType::Median: return "median"; - case AggregationType::Max: return "max"; + constexpr AggregationType(Value value) : m_value(value) {} + constexpr operator Value() const { return m_value; } + + [[nodiscard]] auto to_string() const -> std::string { + switch (m_value) { + case None: return "none"; + case Min: return "min"; + case Mean: return "mean"; + case Median: return "median"; + case Max: return "max"; + } + throw std::logic_error("Unknown AggregationType"); } - throw std::logic_error("Unknown AggregationType"); -} + +private: + Value m_value; +}; struct QueryTimesSummary { AggregationType aggregation_type; @@ -83,7 +92,7 @@ struct QueryTimesSummary { [[nodiscard]] auto to_json() const -> nlohmann::json { - return {{"query_aggregation", to_string(aggregation_type)}, + return {{"query_aggregation", aggregation_type.to_string()}, {"mean", mean}, {"q50", q50}, {"q90", q90}, From 99837220d6666b4f51ef939168c39d1f580fb218 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Tue, 6 Jan 2026 00:13:41 -0300 Subject: [PATCH 38/55] Refactor output file handling --- tools/queries.cpp | 59 +++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 22 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index dd77cae4..bfe66eb0 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -238,11 +238,12 @@ void print_times( QueryTimes const& query_times, std::vector const& queries, std::string const& query_type, - std::ostream& os + std::ostream& output_stream ) { + output_stream << "algorithm\tqid\trun\tusec\n"; for (auto&& [query_idx, query]: enumerate(queries)) { for (auto&& [run_idx, time]: enumerate(query_times.values[query_idx])) { - os << fmt::format( + output_stream << fmt::format( "{}\t{}\t{}\t{}\n", query_type, query.id().value_or(std::to_string(query_idx)), @@ -253,6 +254,22 @@ void print_times( } } +auto open_output_file(std::optional const& 
output_path) + -> std::optional +{ + if (!output_path) { + return std::nullopt; + } + + std::ofstream out(*output_path); + if (!out.is_open()) { + const auto err_msg = fmt::format("Failed to open output file: {}.", *output_path); + throw std::runtime_error(err_msg); + } + + return out; +} + template void perftest( IndexType const* index_ptr, @@ -266,7 +283,7 @@ void perftest( const bool weighted, bool safe, std::size_t runs, - std::optional const& output_path + std::ostream* output_stream ) { auto const& index = *index_ptr; spdlog::info("Warming up posting lists..."); @@ -303,22 +320,6 @@ void perftest( auto scorer = scorer::from_params(scorer_params, wdata); - std::ofstream output_file; - std::ostream* output = nullptr; - if (output_path) { - output_file.open(*output_path); - if (!output_file.is_open()) { - const auto err_msg = fmt::format("Failed to open output file: {}.", *output_path); - spdlog::error(err_msg); - throw std::runtime_error(err_msg); - } - output = &output_file; - - // Add header - output_file << "algorithm\tqid\trun\tusec\n"; - - spdlog::info("Per-run query output will be saved to '{}'.", *output_path); - } for (std::size_t query_type_idx = 0; query_type_idx < query_types.size(); ++query_type_idx) { auto const& t = query_types[query_type_idx]; spdlog::info("Performing {} runs for '{}' queries...", runs, t); @@ -438,8 +439,8 @@ void perftest( } auto query_times = extract_times(query_fun, queries, thresholds, runs, k, safe); print_summary(query_times, type, t, runs, k, safe); - if (output != nullptr) { - print_times(query_times, queries, t, *output); + if (output_stream) { + print_times(query_times, queries, t, *output_stream); } } } @@ -476,6 +477,20 @@ int main(int argc, const char** argv) { std::vector query_types; boost::algorithm::split(query_types, app.algorithm(), boost::is_any_of(":")); + // If required, attempt to open the output file + std::optional output_file; + std::ostream* output_stream = nullptr; + try { + output_file = 
open_output_file(output_path); + if (output_file.has_value()) { + output_stream = &*output_file; + spdlog::info("Per-run query output will be saved to '{}'.", *output_path); + } + } catch (std::exception const& e) { + spdlog::error("{}", e.what()); + return EXIT_FAILURE; + } + run_for_index(app.index_encoding(), MemorySource::mapped_file(app.index_filename()), [&](auto index) { using Index = std::decay_t; auto params = std::make_tuple( @@ -490,7 +505,7 @@ int main(int argc, const char** argv) { app.weighted(), safe, runs, - output_path + output_stream ); if (app.is_wand_compressed()) { if (quantized) { From c5825f4fb8c8719f1520fbf1ee195ff7a729cf6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Tue, 6 Jan 2026 00:22:11 -0300 Subject: [PATCH 39/55] Format `queries.cpp` and `app.cpp` --- tools/app.cpp | 2 +- tools/queries.cpp | 112 +++++++++++++++++++++------------------------- 2 files changed, 52 insertions(+), 62 deletions(-) diff --git a/tools/app.cpp b/tools/app.cpp index e12b07a6..cdaed4f3 100644 --- a/tools/app.cpp +++ b/tools/app.cpp @@ -91,7 +91,7 @@ Algorithm::Algorithm(CLI::App* app) { "-a,--algorithm", m_algorithm, "Query processing algorithm (use ':' to separate multiple algorithms)" - ) + ) ->required(); } diff --git a/tools/queries.cpp b/tools/queries.cpp index bfe66eb0..46c33979 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -61,7 +61,7 @@ using namespace pisa; using ranges::views::enumerate; class AggregationType { -public: + public: enum Value { None = 0, Min = 1, Mean = 2, Median = 3, Max = 4 }; constexpr AggregationType(Value value) : m_value(value) {} @@ -78,7 +78,7 @@ class AggregationType { throw std::logic_error("Unknown AggregationType"); } -private: + private: Value m_value; }; @@ -90,14 +90,15 @@ struct QueryTimesSummary { double q95; double q99; - [[nodiscard]] auto to_json() const -> nlohmann::json - { - return {{"query_aggregation", aggregation_type.to_string()}, - {"mean", mean}, - {"q50", q50}, - 
{"q90", q90}, - {"q95", q95}, - {"q99", q99}}; + [[nodiscard]] auto to_json() const -> nlohmann::json { + return { + {"query_aggregation", aggregation_type.to_string()}, + {"mean", mean}, + {"q50", q50}, + {"q90", q90}, + {"q95", q95}, + {"q99", q99} + }; } }; @@ -105,8 +106,7 @@ struct QueryTimes { std::vector> values; std::size_t corrective_rerun_count; - auto aggregate(AggregationType aggregation_type) const -> std::vector - { + auto aggregate(AggregationType aggregation_type) const -> std::vector { std::vector aggregated_query_times; if (aggregation_type == AggregationType::None) { for (auto const& times_per_run: values) { @@ -116,7 +116,8 @@ struct QueryTimes { } } else if (aggregation_type == AggregationType::Min) { for (auto const& times_per_run: values) { - aggregated_query_times.push_back(*std::min_element(times_per_run.begin(), times_per_run.end()) + aggregated_query_times.push_back( + *std::min_element(times_per_run.begin(), times_per_run.end()) ); } } else if (aggregation_type == AggregationType::Mean) { @@ -135,14 +136,14 @@ struct QueryTimes { median = sorted_times[sample_count / 2]; } else { median = - (sorted_times[sample_count / 2] + sorted_times[sample_count / 2 - 1]) - / 2; + (sorted_times[sample_count / 2] + sorted_times[sample_count / 2 - 1]) / 2; } aggregated_query_times.push_back(median); } } else if (aggregation_type == AggregationType::Max) { for (auto const& times_per_run: values) { - aggregated_query_times.push_back(*std::max_element(times_per_run.begin(), times_per_run.end()) + aggregated_query_times.push_back( + *std::max_element(times_per_run.begin(), times_per_run.end()) ); } } @@ -150,8 +151,7 @@ struct QueryTimes { return aggregated_query_times; } - auto summarize(AggregationType agg_type) const -> QueryTimesSummary - { + auto summarize(AggregationType agg_type) const -> QueryTimesSummary { auto aggregated_times = aggregate(agg_type); double mean = std::accumulate(aggregated_times.begin(), aggregated_times.end(), double()) @@ -163,7 
+163,6 @@ struct QueryTimes { return {agg_type, mean, q50, q90, q95, q99}; } - }; template @@ -176,7 +175,8 @@ auto extract_times( bool safe ) -> QueryTimes { QueryTimes query_times{ - std::vector>(queries.size(), std::vector(runs)), 0}; + std::vector>(queries.size(), std::vector(runs)), 0 + }; // Note: each query is measured once per run, so the set of queries is // measured independently in each run. @@ -216,21 +216,11 @@ void print_summary( summary["corrective_reruns"] = query_times.corrective_rerun_count; summary["times"] = nlohmann::json::array(); - summary["times"].push_back( - query_times.summarize(AggregationType::None).to_json() - ); - summary["times"].push_back( - query_times.summarize(AggregationType::Min).to_json() - ); - summary["times"].push_back( - query_times.summarize(AggregationType::Mean).to_json() - ); - summary["times"].push_back( - query_times.summarize(AggregationType::Median).to_json() - ); - summary["times"].push_back( - query_times.summarize(AggregationType::Max).to_json() - ); + summary["times"].push_back(query_times.summarize(AggregationType::None).to_json()); + summary["times"].push_back(query_times.summarize(AggregationType::Min).to_json()); + summary["times"].push_back(query_times.summarize(AggregationType::Mean).to_json()); + summary["times"].push_back(query_times.summarize(AggregationType::Median).to_json()); + summary["times"].push_back(query_times.summarize(AggregationType::Max).to_json()); std::cout << summary.dump(2) << "\n"; } @@ -254,9 +244,7 @@ void print_times( } } -auto open_output_file(std::optional const& output_path) - -> std::optional -{ +auto open_output_file(std::optional const& output_path) -> std::optional { if (!output_path) { return std::nullopt; } @@ -491,30 +479,32 @@ int main(int argc, const char** argv) { return EXIT_FAILURE; } - run_for_index(app.index_encoding(), MemorySource::mapped_file(app.index_filename()), [&](auto index) { - using Index = std::decay_t; - auto params = std::make_tuple( - &index, - 
app.wand_data_path(), - app.queries(), - app.thresholds_file(), - app.index_encoding(), - query_types, - app.k(), - app.scorer_params(), - app.weighted(), - safe, - runs, - output_stream - ); - if (app.is_wand_compressed()) { - if (quantized) { - std::apply(perftest, params); + run_for_index( + app.index_encoding(), MemorySource::mapped_file(app.index_filename()), [&](auto index) { + using Index = std::decay_t; + auto params = std::make_tuple( + &index, + app.wand_data_path(), + app.queries(), + app.thresholds_file(), + app.index_encoding(), + query_types, + app.k(), + app.scorer_params(), + app.weighted(), + safe, + runs, + output_stream + ); + if (app.is_wand_compressed()) { + if (quantized) { + std::apply(perftest, params); + } else { + std::apply(perftest, params); + } } else { - std::apply(perftest, params); + std::apply(perftest, params); } - } else { - std::apply(perftest, params); } - }); + ); } From f357034e80a09f2c5937e4affe35f479e3929e3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Tue, 6 Jan 2026 00:50:07 -0300 Subject: [PATCH 40/55] Make `AggregationType` constructor explicit --- tools/queries.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 46c33979..2f4fb4e1 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -64,7 +64,7 @@ class AggregationType { public: enum Value { None = 0, Min = 1, Mean = 2, Median = 3, Max = 4 }; - constexpr AggregationType(Value value) : m_value(value) {} + explicit constexpr AggregationType(Value value) : m_value(value) {} constexpr operator Value() const { return m_value; } [[nodiscard]] auto to_string() const -> std::string { @@ -216,11 +216,14 @@ void print_summary( summary["corrective_reruns"] = query_times.corrective_rerun_count; summary["times"] = nlohmann::json::array(); - summary["times"].push_back(query_times.summarize(AggregationType::None).to_json()); - 
summary["times"].push_back(query_times.summarize(AggregationType::Min).to_json()); - summary["times"].push_back(query_times.summarize(AggregationType::Mean).to_json()); - summary["times"].push_back(query_times.summarize(AggregationType::Median).to_json()); - summary["times"].push_back(query_times.summarize(AggregationType::Max).to_json()); + for (auto agg_type: + {AggregationType::None, + AggregationType::Min, + AggregationType::Mean, + AggregationType::Median, + AggregationType::Max}) { + summary["times"].push_back(query_times.summarize(AggregationType(agg_type)).to_json()); + } std::cout << summary.dump(2) << "\n"; } From 40a3f108ba73efaaaa3c629db5f5f1010992e7af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Wed, 7 Jan 2026 17:14:35 -0300 Subject: [PATCH 41/55] Modify algorithm validation to perform it before execution --- tools/app.cpp | 30 +++++++++++++++++++++++++++++- tools/app.hpp | 2 ++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/tools/app.cpp b/tools/app.cpp index cdaed4f3..1abdaa74 100644 --- a/tools/app.cpp +++ b/tools/app.cpp @@ -1,3 +1,6 @@ +#include +#include + #include "app.hpp" #include "type_safe.hpp" @@ -62,6 +65,20 @@ auto Analyzer::text_analyzer() const -> TextAnalyzer { const std::set Analyzer::VALID_TOKENIZERS = {"whitespace", "english"}; const std::set Analyzer::VALID_TOKEN_FILTERS = {"lowercase", "porter2", "krovetz"}; +const std::set Algorithm::VALID_ALGORITHMS = { + "and", + "or", + "or_freq", + "wand", + "block_max_wand", + "block_max_maxscore", + "ranked_and", + "block_max_ranked_and", + "ranked_or", + "maxscore", + "ranked_or_taat", + "ranked_or_taat_lazy" +}; LogLevel::LogLevel(CLI::App* app) { app->add_option("-L,--log-level", m_level, "Log level") @@ -92,7 +109,18 @@ Algorithm::Algorithm(CLI::App* app) { m_algorithm, "Query processing algorithm (use ':' to separate multiple algorithms)" ) - ->required(); + ->required() + ->check([](const std::string& value) -> std::string { + 
std::vector curr_algorithms; + boost::algorithm::split(curr_algorithms, value, boost::is_any_of(":")); + for (const auto& algorithm: curr_algorithms) { + const bool is_valid = VALID_ALGORITHMS.find(algorithm) != VALID_ALGORITHMS.end(); + if (!is_valid) { + return "Algorithm '" + algorithm + "' is not valid"; + } + } + return ""; + }); } auto Algorithm::algorithm() const -> std::string const& { diff --git a/tools/app.hpp b/tools/app.hpp index 07b0619e..157fa983 100644 --- a/tools/app.hpp +++ b/tools/app.hpp @@ -195,6 +195,8 @@ namespace arg { }; struct Algorithm { + static const std::set VALID_ALGORITHMS; + explicit Algorithm(CLI::App* app); [[nodiscard]] auto algorithm() const -> std::string const&; From a5d33da4f98404bc8f68953442f030f530ff0edf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Wed, 7 Jan 2026 17:17:08 -0300 Subject: [PATCH 42/55] Add license comment in `app.cpp` and `app.hpp` --- tools/app.cpp | 14 ++++++++++++++ tools/app.hpp | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/tools/app.cpp b/tools/app.cpp index 1abdaa74..f06947a5 100644 --- a/tools/app.cpp +++ b/tools/app.cpp @@ -1,3 +1,17 @@ +// Copyright 2025 PISA Developers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ #include #include diff --git a/tools/app.hpp b/tools/app.hpp index 157fa983..12dfe719 100644 --- a/tools/app.hpp +++ b/tools/app.hpp @@ -1,3 +1,17 @@ +// Copyright 2025 PISA Developers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #pragma once #include From a41dc7df1094d0aba63406b1d313832c813f7377 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Thu, 8 Jan 2026 18:11:16 -0300 Subject: [PATCH 43/55] Modify 'boost' includes to use quoted style in `app.cpp` --- tools/app.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/app.cpp b/tools/app.cpp index f06947a5..b5123cd8 100644 --- a/tools/app.cpp +++ b/tools/app.cpp @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include -#include +#include "boost/algorithm/string/classification.hpp" +#include "boost/algorithm/string/split.hpp" #include "app.hpp" #include "type_safe.hpp" From 3be83a9364f86cce9b61665e9ab66ea0f8836cb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Thu, 8 Jan 2026 18:36:13 -0300 Subject: [PATCH 44/55] Update `querying.md` --- docs/src/guide/querying.md | 41 +++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/docs/src/guide/querying.md b/docs/src/guide/querying.md index 1235997f..dbacf4f1 100644 --- a/docs/src/guide/querying.md +++ b/docs/src/guide/querying.md @@ -1,12 +1,11 @@ # Querying -Now it is possible to query the index. The command `queries` treats each -line of the standard input (or a file if `-q` is present) as a separate -query. A query line contains a whitespace-delimited list of tokens. -These tokens are either interpreted as terms (if `--terms` is defined, -which will be used to resolve term IDs) or as term IDs (if `--terms` is -not defined). Optionally, a query can contain query ID delimited by a -colon: +The command `queries` treats each line of the standard input (or a file +if `-q` is present) as a separate query. A query line contains a +whitespace-delimited list of tokens. These tokens are either interpreted +as terms (if `--terms` is defined, which will be used to resolve term +IDs) or as term IDs (if `--terms` is not defined). Optionally, a query +can contain query ID delimited by a colon: ``` Q1:one two three @@ -28,8 +27,36 @@ operators can be used (see [Query algorithms](#query-algorithms)), and also multiple operators separated by colon (`and:or:wand`), which will run multiple passes, one per algorithm. +The tool outputs a JSON with query execution statistics including mean, median +(`q50`), and percentiles (`q90`, `q95`, `q99`) for different aggregation types +(`none`, `min`, `mean`, `median`, `max`). 
+ If the WAND file is compressed, append `--compressed-wand` flag. +## Supported algorithms + +The following algorithms are available via the `-a` option: + +* `and` +* `or` +* `or_freq` +* `wand` +* `block_max_wand` +* `block_max_maxscore` +* `ranked_and` +* `block_max_ranked_and` +* `ranked_or` +* `maxscore` +* `ranked_or_taat` +* `ranked_or_taat_lazy` + +## Additional options + +* `--runs `: Number of runs per query (default: 3) +* `-o, --output `: Output file for per-run query timing data +* `--safe`: Rerun if not enough results with pruning (requires `--thresholds`) +* `--quantized`: Quantized scores + ## Build additional data To perform BM25 queries it is necessary to build an additional file From 7a2ae18cddad0f997895622d73594eb206a686be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Thu, 8 Jan 2026 18:36:41 -0300 Subject: [PATCH 45/55] Format `querying.md` --- docs/src/guide/querying.md | 72 ++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/docs/src/guide/querying.md b/docs/src/guide/querying.md index dbacf4f1..7648ddef 100644 --- a/docs/src/guide/querying.md +++ b/docs/src/guide/querying.md @@ -1,11 +1,10 @@ # Querying -The command `queries` treats each line of the standard input (or a file -if `-q` is present) as a separate query. A query line contains a -whitespace-delimited list of tokens. These tokens are either interpreted -as terms (if `--terms` is defined, which will be used to resolve term -IDs) or as term IDs (if `--terms` is not defined). Optionally, a query -can contain query ID delimited by a colon: +The command `queries` treats each line of the standard input (or a file if `-q` +is present) as a separate query. A query line contains a whitespace-delimited +list of tokens. These tokens are either interpreted as terms (if `--terms` is +defined, which will be used to resolve term IDs) or as term IDs (if `--terms` is +not defined). 
Optionally, a query can contain query ID delimited by a colon: ``` Q1:one two three @@ -22,10 +21,10 @@ For example: -w test_collection.wand \ # metadata file -q ../test/test_data/queries # query input file -This performs conjunctive queries (`and`). In place of `and` other -operators can be used (see [Query algorithms](#query-algorithms)), and -also multiple operators separated by colon (`and:or:wand`), which will -run multiple passes, one per algorithm. +This performs conjunctive queries (`and`). In place of `and` other operators can +be used (see [Query algorithms](#query-algorithms)), and also multiple operators +separated by colon (`and:or:wand`), which will run multiple passes, one per +algorithm. The tool outputs a JSON with query execution statistics including mean, median (`q50`), and percentiles (`q90`, `q95`, `q99`) for different aggregation types @@ -37,41 +36,40 @@ If the WAND file is compressed, append `--compressed-wand` flag. The following algorithms are available via the `-a` option: -* `and` -* `or` -* `or_freq` -* `wand` -* `block_max_wand` -* `block_max_maxscore` -* `ranked_and` -* `block_max_ranked_and` -* `ranked_or` -* `maxscore` -* `ranked_or_taat` -* `ranked_or_taat_lazy` +- `and` +- `or` +- `or_freq` +- `wand` +- `block_max_wand` +- `block_max_maxscore` +- `ranked_and` +- `block_max_ranked_and` +- `ranked_or` +- `maxscore` +- `ranked_or_taat` +- `ranked_or_taat_lazy` ## Additional options -* `--runs `: Number of runs per query (default: 3) -* `-o, --output `: Output file for per-run query timing data -* `--safe`: Rerun if not enough results with pruning (requires `--thresholds`) -* `--quantized`: Quantized scores +- `--runs `: Number of runs per query (default: 3) +- `-o, --output `: Output file for per-run query timing data +- `--safe`: Rerun if not enough results with pruning (requires `--thresholds`) +- `--quantized`: Quantized scores ## Build additional data -To perform BM25 queries it is necessary to build an additional file 
-containing the parameters needed to compute the score, such as the -document lengths. The file can be built with the following command: +To perform BM25 queries it is necessary to build an additional file containing +the parameters needed to compute the score, such as the document lengths. The +file can be built with the following command: $ ./bin/create_wand_data \ -c ../test/test_data/test_collection \ -o test_collection.wand -If you want to compress the file append `--compress` at the end of the -command. When using variable-sized blocks (for VBMW) via the -`--variable-block` parameter, you can also specify lambda with the `-l -` or `--lambda ` flags. The value of lambda impacts the -mean size of the variable blocks that are output. See the VBMW paper -(listed below) for more details. If using fixed-sized blocks, which is -the default, you can supply the desired block size using the `-b -` or `--block-size ` arguments. +If you want to compress the file append `--compress` at the end of the command. +When using variable-sized blocks (for VBMW) via the `--variable-block` +parameter, you can also specify lambda with the `-l ` or +`--lambda ` flags. The value of lambda impacts the mean size of the +variable blocks that are output. See the VBMW paper (listed below) for more +details. If using fixed-sized blocks, which is the default, you can supply the +desired block size using the `-b ` or `--block-size ` arguments. 
From 7e4c58fb6efffa8f2c93bc96dd95cd87f8d5091e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Thu, 15 Jan 2026 17:47:52 -0300 Subject: [PATCH 46/55] Remove raw pointer for output file in `queries.cpp` --- tools/queries.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 2f4fb4e1..2ca6780e 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -274,7 +274,7 @@ void perftest( const bool weighted, bool safe, std::size_t runs, - std::ostream* output_stream + std::optional output_file ) { auto const& index = *index_ptr; spdlog::info("Warming up posting lists..."); @@ -430,8 +430,8 @@ void perftest( } auto query_times = extract_times(query_fun, queries, thresholds, runs, k, safe); print_summary(query_times, type, t, runs, k, safe); - if (output_stream) { - print_times(query_times, queries, t, *output_stream); + if (output_file) { + print_times(query_times, queries, t, *output_file); } } } @@ -470,11 +470,9 @@ int main(int argc, const char** argv) { // If required, attempt to open the output file std::optional output_file; - std::ostream* output_stream = nullptr; try { output_file = open_output_file(output_path); if (output_file.has_value()) { - output_stream = &*output_file; spdlog::info("Per-run query output will be saved to '{}'.", *output_path); } } catch (std::exception const& e) { @@ -497,16 +495,18 @@ int main(int argc, const char** argv) { app.weighted(), safe, runs, - output_stream + std::move(output_file) ); if (app.is_wand_compressed()) { if (quantized) { - std::apply(perftest, params); + std::apply( + perftest, std::move(params) + ); } else { - std::apply(perftest, params); + std::apply(perftest, std::move(params)); } } else { - std::apply(perftest, params); + std::apply(perftest, std::move(params)); } } ); From 5c6b44d8276df105296195152cced794777b2537 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Thu, 15 Jan 2026 
17:51:23 -0300 Subject: [PATCH 47/55] Refactor median computation --- tools/queries.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 2ca6780e..cb1cf929 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -131,13 +131,9 @@ struct QueryTimes { auto sorted_times = times_per_run; std::sort(sorted_times.begin(), sorted_times.end()); std::size_t sample_count = sorted_times.size(); - double median = 0; - if (sample_count % 2 == 1) { - median = sorted_times[sample_count / 2]; - } else { - median = - (sorted_times[sample_count / 2] + sorted_times[sample_count / 2 - 1]) / 2; - } + double median = sample_count % 2 == 1 + ? sorted_times[sample_count / 2] + : (sorted_times[sample_count / 2] + sorted_times[sample_count / 2 - 1]) / 2; aggregated_query_times.push_back(median); } } else if (aggregation_type == AggregationType::Max) { From ecae3d4a9f8a27a06b1a4d1c672b565235e9f4cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Thu, 15 Jan 2026 18:04:53 -0300 Subject: [PATCH 48/55] Refactor aggregation for type `None` --- tools/queries.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index cb1cf929..4ccf1a27 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -109,10 +109,10 @@ struct QueryTimes { auto aggregate(AggregationType aggregation_type) const -> std::vector { std::vector aggregated_query_times; if (aggregation_type == AggregationType::None) { - for (auto const& times_per_run: values) { - for (auto t: times_per_run) { - aggregated_query_times.push_back(t); - } + for (auto const& times_per_run : values) { + aggregated_query_times.insert( + aggregated_query_times.end(), times_per_run.begin(), times_per_run.end() + ); } } else if (aggregation_type == AggregationType::Min) { for (auto const& times_per_run: values) { From b6fd578d0091e1c8b29f728f925f4e15e858fb07 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Thu, 15 Jan 2026 18:09:19 -0300 Subject: [PATCH 49/55] Move sorting from `aggregate` to `summarize` --- tools/queries.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 4ccf1a27..585528e4 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -143,12 +143,12 @@ struct QueryTimes { ); } } - std::sort(aggregated_query_times.begin(), aggregated_query_times.end()); return aggregated_query_times; } auto summarize(AggregationType agg_type) const -> QueryTimesSummary { auto aggregated_times = aggregate(agg_type); + std::sort(aggregated_times.begin(), aggregated_times.end()); double mean = std::accumulate(aggregated_times.begin(), aggregated_times.end(), double()) / aggregated_times.size(); From a792f2f2be879e916beb0e04767432c6d5e3709a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Thu, 15 Jan 2026 18:49:49 -0300 Subject: [PATCH 50/55] Refactor `aggregate` by splitting logic --- tools/queries.cpp | 99 ++++++++++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 39 deletions(-) diff --git a/tools/queries.cpp b/tools/queries.cpp index 585528e4..56d39750 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -106,44 +106,67 @@ struct QueryTimes { std::vector> values; std::size_t corrective_rerun_count; + auto aggregate_none() const -> std::vector { + std::vector aggregated; + for (auto const& times_per_run: values) { + aggregated.insert(aggregated.end(), times_per_run.begin(), times_per_run.end()); + } + return aggregated; + } + + auto aggregate_min() const -> std::vector { + std::vector aggregated; + aggregated.reserve(values.size()); + for (auto const& times_per_run: values) { + aggregated.push_back(*std::min_element(times_per_run.begin(), times_per_run.end())); + } + return aggregated; + } + + auto aggregate_mean() const -> std::vector { + std::vector aggregated; + aggregated.reserve(values.size()); + for 
(auto const& times_per_run: values) { + double sum = std::accumulate(times_per_run.begin(), times_per_run.end(), double()); + double mean = sum / times_per_run.size(); + aggregated.push_back(mean); + } + return aggregated; + } + + auto aggregate_median() const -> std::vector { + std::vector aggregated; + aggregated.reserve(values.size()); + for (auto const& times_per_run: values) { + auto sorted_times = times_per_run; + std::sort(sorted_times.begin(), sorted_times.end()); + std::size_t sample_count = sorted_times.size(); + double median = sample_count % 2 == 1 + ? sorted_times[sample_count / 2] + : (sorted_times[sample_count / 2] + sorted_times[sample_count / 2 - 1]) / 2; + aggregated.push_back(median); + } + return aggregated; + } + + auto aggregate_max() const -> std::vector { + std::vector aggregated; + aggregated.reserve(values.size()); + for (auto const& times_per_run: values) { + aggregated.push_back(*std::max_element(times_per_run.begin(), times_per_run.end())); + } + return aggregated; + } + auto aggregate(AggregationType aggregation_type) const -> std::vector { - std::vector aggregated_query_times; - if (aggregation_type == AggregationType::None) { - for (auto const& times_per_run : values) { - aggregated_query_times.insert( - aggregated_query_times.end(), times_per_run.begin(), times_per_run.end() - ); - } - } else if (aggregation_type == AggregationType::Min) { - for (auto const& times_per_run: values) { - aggregated_query_times.push_back( - *std::min_element(times_per_run.begin(), times_per_run.end()) - ); - } - } else if (aggregation_type == AggregationType::Mean) { - for (auto const& times_per_run: values) { - double sum = std::accumulate(times_per_run.begin(), times_per_run.end(), double()); - double mean = sum / times_per_run.size(); - aggregated_query_times.push_back(mean); - } - } else if (aggregation_type == AggregationType::Median) { - for (auto const& times_per_run: values) { - auto sorted_times = times_per_run; - 
std::sort(sorted_times.begin(), sorted_times.end()); - std::size_t sample_count = sorted_times.size(); - double median = sample_count % 2 == 1 - ? sorted_times[sample_count / 2] - : (sorted_times[sample_count / 2] + sorted_times[sample_count / 2 - 1]) / 2; - aggregated_query_times.push_back(median); - } - } else if (aggregation_type == AggregationType::Max) { - for (auto const& times_per_run: values) { - aggregated_query_times.push_back( - *std::max_element(times_per_run.begin(), times_per_run.end()) - ); - } + switch (aggregation_type) { + case AggregationType::None: return aggregate_none(); + case AggregationType::Min: return aggregate_min(); + case AggregationType::Mean: return aggregate_mean(); + case AggregationType::Median: return aggregate_median(); + case AggregationType::Max: return aggregate_max(); } - return aggregated_query_times; + throw std::logic_error("Unknown AggregationType"); } auto summarize(AggregationType agg_type) const -> QueryTimesSummary { @@ -495,9 +518,7 @@ int main(int argc, const char** argv) { ); if (app.is_wand_compressed()) { if (quantized) { - std::apply( - perftest, std::move(params) - ); + std::apply(perftest, std::move(params)); } else { std::apply(perftest, std::move(params)); } From cee923e020a145380069c68755d1a31bc6d29977 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Fri, 16 Jan 2026 08:47:02 -0300 Subject: [PATCH 51/55] Modify algorithm parsing to use multiple `-a` options instead of colon separator --- tools/app.cpp | 29 ++++++----------------------- tools/app.hpp | 4 ++-- tools/evaluate_queries.cpp | 4 +--- tools/queries.cpp | 6 +----- tools/tests/test_app.cpp | 14 ++++++++------ 5 files changed, 18 insertions(+), 39 deletions(-) diff --git a/tools/app.cpp b/tools/app.cpp index b5123cd8..6ef8eb4b 100644 --- a/tools/app.cpp +++ b/tools/app.cpp @@ -12,9 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "boost/algorithm/string/classification.hpp" -#include "boost/algorithm/string/split.hpp" - #include "app.hpp" #include "type_safe.hpp" @@ -118,27 +115,13 @@ const std::map LogLevel::ENUM_MAP = { }; Algorithm::Algorithm(CLI::App* app) { - app->add_option( - "-a,--algorithm", - m_algorithm, - "Query processing algorithm (use ':' to separate multiple algorithms)" - ) + app->add_option("-a,--algorithm", m_algorithms, "Query processing algorithm") ->required() - ->check([](const std::string& value) -> std::string { - std::vector curr_algorithms; - boost::algorithm::split(curr_algorithms, value, boost::is_any_of(":")); - for (const auto& algorithm: curr_algorithms) { - const bool is_valid = VALID_ALGORITHMS.find(algorithm) != VALID_ALGORITHMS.end(); - if (!is_valid) { - return "Algorithm '" + algorithm + "' is not valid"; - } - } - return ""; - }); -} - -auto Algorithm::algorithm() const -> std::string const& { - return m_algorithm; + ->check(CLI::IsMember(VALID_ALGORITHMS)); +} + +auto Algorithm::algorithms() const -> std::vector const& { + return m_algorithms; } Quantize::Quantize(CLI::App* app) : m_params("") { diff --git a/tools/app.hpp b/tools/app.hpp index 12dfe719..0e1d51f2 100644 --- a/tools/app.hpp +++ b/tools/app.hpp @@ -212,10 +212,10 @@ namespace arg { static const std::set VALID_ALGORITHMS; explicit Algorithm(CLI::App* app); - [[nodiscard]] auto algorithm() const -> std::string const&; + [[nodiscard]] auto algorithms() const -> std::vector const&; private: - std::string m_algorithm; + std::vector m_algorithms; }; enum class ScorerMode : bool { Required, Optional }; diff --git a/tools/evaluate_queries.cpp b/tools/evaluate_queries.cpp index 92380979..228c01e7 100644 --- a/tools/evaluate_queries.cpp +++ b/tools/evaluate_queries.cpp @@ -2,8 +2,6 @@ #include #include -#include -#include #include #include #include @@ -227,7 +225,7 @@ int main(int argc, const char** argv) { app.queries(), app.thresholds_file(), app.index_encoding(), - app.algorithm(), + 
app.algorithms().front(), app.k(), documents_file, app.scorer_params(), diff --git a/tools/queries.cpp b/tools/queries.cpp index 56d39750..3b060478 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -20,8 +20,6 @@ #include #include -#include -#include #include #include #include @@ -483,9 +481,7 @@ int main(int argc, const char** argv) { spdlog::set_default_logger(spdlog::stderr_color_mt("stderr")); spdlog::set_level(app.log_level()); - // Parse query types (algorithms) - std::vector query_types; - boost::algorithm::split(query_types, app.algorithm(), boost::is_any_of(":")); + auto const& query_types = app.algorithms(); // If required, attempt to open the output file std::optional output_file; diff --git a/tools/tests/test_app.cpp b/tools/tests/test_app.cpp index abf9cae5..0c628ce3 100644 --- a/tools/tests/test_app.cpp +++ b/tools/tests/test_app.cpp @@ -397,14 +397,16 @@ TEST_CASE("Algorithm", "[cli]") { REQUIRE_THROWS(parse(app, {})); } SECTION("Long option") { - // Note: algorithm names are not validated until later. - parse(app, {"--algorithm", "ALG"}); - REQUIRE(args.algorithm() == "ALG"); + parse(app, {"--algorithm", "and"}); + REQUIRE(args.algorithms() == std::vector{"and"}); } SECTION("Short option") { - // Note: algorithm names are not validated until later. 
- parse(app, {"-a", "ALG"}); - REQUIRE(args.algorithm() == "ALG"); + parse(app, {"-a", "or"}); + REQUIRE(args.algorithms() == std::vector{"or"}); + } + SECTION("Multiple algorithms") { + parse(app, {"-a", "wand", "-a", "and"}); + REQUIRE(args.algorithms() == std::vector{"wand", "and"}); } } From 9b24e4f0efafe141909bd205d9afc3c535c0042b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Fri, 16 Jan 2026 15:19:53 -0300 Subject: [PATCH 52/55] Add validation for algorithms requiring WAND data --- tools/app.cpp | 53 +++++++++++++++++++++++++++++----------- tools/app.hpp | 3 ++- tools/tests/test_app.cpp | 20 +++++++++++++++ 3 files changed, 61 insertions(+), 15 deletions(-) diff --git a/tools/app.cpp b/tools/app.cpp index 6ef8eb4b..3f97e61f 100644 --- a/tools/app.cpp +++ b/tools/app.cpp @@ -76,19 +76,21 @@ auto Analyzer::text_analyzer() const -> TextAnalyzer { const std::set Analyzer::VALID_TOKENIZERS = {"whitespace", "english"}; const std::set Analyzer::VALID_TOKEN_FILTERS = {"lowercase", "porter2", "krovetz"}; -const std::set Algorithm::VALID_ALGORITHMS = { - "and", - "or", - "or_freq", - "wand", - "block_max_wand", - "block_max_maxscore", - "ranked_and", - "block_max_ranked_and", - "ranked_or", - "maxscore", - "ranked_or_taat", - "ranked_or_taat_lazy" + +// algorithm -> requires_wand_data +const std::map Algorithm::VALID_ALGORITHMS = { + {"and", false}, + {"or", false}, + {"or_freq", false}, + {"wand", true}, + {"block_max_wand", true}, + {"block_max_maxscore", true}, + {"ranked_and", false}, + {"block_max_ranked_and", true}, + {"ranked_or", false}, + {"maxscore", true}, + {"ranked_or_taat", false}, + {"ranked_or_taat_lazy", false} }; LogLevel::LogLevel(CLI::App* app) { @@ -117,7 +119,30 @@ const std::map LogLevel::ENUM_MAP = { Algorithm::Algorithm(CLI::App* app) { app->add_option("-a,--algorithm", m_algorithms, "Query processing algorithm") ->required() - ->check(CLI::IsMember(VALID_ALGORITHMS)); + ->check([](const std::string& algorithm) 
-> std::string { + const bool is_valid = VALID_ALGORITHMS.find(algorithm) != VALID_ALGORITHMS.end(); + if (!is_valid) { + return "Algorithm '" + algorithm + "' is not valid"; + } + + return ""; + }); + + // Check if WAND data is provided when it is required by an algorithm. + if (auto* wand_option = app->get_option_no_throw("--wand")) { + app->callback([this, &wand_opt = *wand_option]() { + if (!wand_opt) { + for (const auto& algorithm: m_algorithms) { + if (VALID_ALGORITHMS.at(algorithm)) { + throw CLI::ValidationError( + "Algorithm '" + algorithm + + "' requires WAND data but it was not provided" + ); + } + } + } + }); + } } auto Algorithm::algorithms() const -> std::vector const& { diff --git a/tools/app.hpp b/tools/app.hpp index 0e1d51f2..7df1d227 100644 --- a/tools/app.hpp +++ b/tools/app.hpp @@ -209,7 +209,8 @@ namespace arg { }; struct Algorithm { - static const std::set VALID_ALGORITHMS; + // algorithm -> requires_wand_data + static const std::map VALID_ALGORITHMS; explicit Algorithm(CLI::App* app); [[nodiscard]] auto algorithms() const -> std::vector const&; diff --git a/tools/tests/test_app.cpp b/tools/tests/test_app.cpp index 0c628ce3..a2c38d94 100644 --- a/tools/tests/test_app.cpp +++ b/tools/tests/test_app.cpp @@ -410,6 +410,26 @@ TEST_CASE("Algorithm", "[cli]") { } } +TEST_CASE("Algorithm requires WAND data", "[cli]") { + CLI::App app("Algorithm WAND test"); + pisa::Args, pisa::arg::Algorithm> args(&app); + SECTION("Algorithm not requiring WAND without WAND data succeeds") { + REQUIRE_NOTHROW(parse(app, {"-a", "and"})); + } + SECTION("Algorithm requiring WAND without WAND data throws") { + REQUIRE_THROWS(parse(app, {"-a", "wand"})); + } + SECTION("Algorithm requiring WAND with WAND data succeeds") { + REQUIRE_NOTHROW(parse(app, {"-a", "wand", "-w", "WDATA"})); + } + SECTION("Multiple algorithms with one requiring WAND without WAND data throws") { + REQUIRE_THROWS(parse(app, {"-a", "and", "-a", "maxscore"})); + } + SECTION("Multiple algorithms with one 
requiring WAND with WAND data succeeds") { + REQUIRE_NOTHROW(parse(app, {"-a", "and", "-a", "maxscore", "-w", "WDATA"})); + } +} + TEST_CASE("Scorer", "[cli]") { CLI::App app("Scorer test"); pisa::Args args(&app); From 200397bf98b519c0b5ac312a13ff58d68d2b3e47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Fri, 16 Jan 2026 16:01:19 -0300 Subject: [PATCH 53/55] Modify `evaluate_queries` to allow only one algorithm at a time --- tools/evaluate_queries.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/evaluate_queries.cpp b/tools/evaluate_queries.cpp index 228c01e7..795eb610 100644 --- a/tools/evaluate_queries.cpp +++ b/tools/evaluate_queries.cpp @@ -206,6 +206,11 @@ int main(int argc, const char** argv) { CLI11_PARSE(app, argc, argv); + if (app.algorithms().size() > 1) { + spdlog::error("Only one algorithm (query type) is allowed at a time."); + return 1; + } + spdlog::set_level(app.log_level()); tbb::global_control control(tbb::global_control::max_allowed_parallelism, app.threads() + 1); spdlog::info("Number of worker threads: {}", app.threads()); From ab79582ccafe192f5c8dbbc36f9dd87a230f4d49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Wed, 21 Jan 2026 23:11:43 -0300 Subject: [PATCH 54/55] Update `queries.md` and `querying.md` --- docs/src/cli/queries.md | 8 ++++---- docs/src/guide/querying.md | 9 ++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/docs/src/cli/queries.md b/docs/src/cli/queries.md index 4338dc59..5d9015ac 100644 --- a/docs/src/cli/queries.md +++ b/docs/src/cli/queries.md @@ -8,11 +8,11 @@ ## Description -Runs query benchmarks. +Runs query benchmarks focused on performance measurement, executing each query +on the given index multiple times. Then, it aggregates statistics across all +queries. -Executes each query on the given index multiple times, and takes the -minimum of those as the final value. Then, it aggregates statistics -across all queries. 
+Note: for retrieval results use `evaluate_queries`. ## Input diff --git a/docs/src/guide/querying.md b/docs/src/guide/querying.md index 7648ddef..17ddfec9 100644 --- a/docs/src/guide/querying.md +++ b/docs/src/guide/querying.md @@ -22,13 +22,12 @@ For example: -q ../test/test_data/queries # query input file This performs conjunctive queries (`and`). In place of `and` other operators can -be used (see [Query algorithms](#query-algorithms)), and also multiple operators -separated by colon (`and:or:wand`), which will run multiple passes, one per -algorithm. +be used (see [Query algorithms](#query-algorithms)). To run multiple algorithms, +provide `-a` multiple times (for example, `-a and -a or -a wand`). The tool outputs a JSON with query execution statistics including mean, median -(`q50`), and percentiles (`q90`, `q95`, `q99`) for different aggregation types -(`none`, `min`, `mean`, `median`, `max`). +(`q50`), and percentiles (`q90`, `q95`, `q99`) for different _per-query_ +aggregation (`none`, `min`, `mean`, `median`, `max`). If the WAND file is compressed, append `--compressed-wand` flag. 
From 8f0dff6b3b147c995e3d48e79cbf36f27707dd49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agust=C3=ADn=20Gonz=C3=A1lez?= Date: Mon, 26 Jan 2026 23:48:05 -0300 Subject: [PATCH 55/55] Regenerate documentation --- docs/book/404.html | 109 ++-- docs/book/book.js | 670 ++++++++------------- docs/book/cli/compress_inverted_index.html | 109 ++-- docs/book/cli/compute_intersection.html | 109 ++-- docs/book/cli/count-postings.html | 109 ++-- docs/book/cli/create_wand_data.html | 109 ++-- docs/book/cli/evaluate_queries.html | 111 ++-- docs/book/cli/extract-maxscores.html | 109 ++-- docs/book/cli/extract_topics.html | 109 ++-- docs/book/cli/invert.html | 113 ++-- docs/book/cli/kth_threshold.html | 109 ++-- docs/book/cli/lexicon.html | 109 ++-- docs/book/cli/map_queries.html | 109 ++-- docs/book/cli/parse_collection.html | 109 ++-- docs/book/cli/partition_fwd_index.html | 109 ++-- docs/book/cli/queries.html | 122 ++-- docs/book/cli/read_collection.html | 109 ++-- docs/book/cli/reorder-docids.html | 109 ++-- docs/book/cli/sample_inverted_index.html | 109 ++-- docs/book/cli/selective_queries.html | 109 ++-- docs/book/cli/shards.html | 109 ++-- docs/book/cli/stem_queries.html | 109 ++-- docs/book/cli/taily-stats.html | 109 ++-- docs/book/cli/taily-thresholds.html | 109 ++-- docs/book/cli/thresholds.html | 109 ++-- docs/book/css/chrome.css | 147 +---- docs/book/css/general.css | 64 +- docs/book/css/variables.css | 62 +- docs/book/fonts/fonts.css | 22 +- docs/book/guide/algorithms.html | 109 ++-- docs/book/guide/compressing.html | 109 ++-- docs/book/guide/indexing-pipeline.html | 109 ++-- docs/book/guide/installation.html | 109 ++-- docs/book/guide/inverting.html | 109 ++-- docs/book/guide/parsing.html | 109 ++-- docs/book/guide/querying.html | 173 +++--- docs/book/guide/reordering.html | 109 ++-- docs/book/guide/requirements.html | 109 ++-- docs/book/guide/sharding.html | 109 ++-- docs/book/guide/threshold-estimation.html | 109 ++-- docs/book/guide/wand_data.html | 109 ++-- 
docs/book/highlight.css | 1 - docs/book/index.html | 109 ++-- docs/book/introduction.html | 109 ++-- docs/book/print.html | 192 +++--- docs/book/searcher.js | 459 ++++++-------- docs/book/searchindex.js | 2 +- docs/book/searchindex.json | 1 + docs/book/specs/lookup-table.html | 109 ++-- docs/book/toc.html | 32 - docs/book/toc.js | 70 --- docs/book/tomorrow-night.css | 2 - docs/book/tutorial/robust04.html | 109 ++-- 53 files changed, 2776 insertions(+), 3391 deletions(-) create mode 100644 docs/book/searchindex.json delete mode 100644 docs/book/toc.html delete mode 100644 docs/book/toc.js diff --git a/docs/book/404.html b/docs/book/404.html index 49910150..068efddd 100644 --- a/docs/book/404.html +++ b/docs/book/404.html @@ -1,5 +1,5 @@ - + @@ -8,7 +8,7 @@ - + @@ -25,41 +25,26 @@ - - - + + + - - - - - - -
-
-

Keyboard shortcuts

-
-

Press or to navigate between chapters

-

Press S or / to search in the book

-

Press ? to show this help

-

Press Esc to hide this help

-
-
-
+
+ + + + + +
- +