From e3f66582cdf0838fecedc639e73365ba1c6f17ff Mon Sep 17 00:00:00 2001 From: Xinhao Yuan Date: Wed, 5 Nov 2025 12:59:11 -0800 Subject: [PATCH] Add various weight computation methods. PiperOrigin-RevId: 828592996 --- centipede/centipede.cc | 15 ++++++- centipede/centipede.h | 1 + centipede/centipede_flags.inc | 3 ++ centipede/corpus.cc | 35 +++++++++++---- centipede/corpus.h | 19 +++++++-- centipede/corpus_test.cc | 80 +++++++++++++++++++++++++++++++++-- centipede/feature_set.cc | 5 ++- centipede/feature_set.h | 8 ++-- centipede/feature_set_test.cc | 8 ++-- 9 files changed, 146 insertions(+), 28 deletions(-) diff --git a/centipede/centipede.cc b/centipede/centipede.cc index bb3793f4f..8b17cc60a 100644 --- a/centipede/centipede.cc +++ b/centipede/centipede.cc @@ -76,6 +76,7 @@ #include "./centipede/centipede_callbacks.h" #include "./centipede/command.h" #include "./centipede/control_flow.h" +#include "./centipede/corpus.h" #include "./centipede/corpus_io.h" #include "./centipede/coverage.h" #include "./centipede/environment.h" @@ -133,6 +134,17 @@ Centipede::Centipede(const Environment &env, CentipedeCallbacks &user_callbacks, FUZZTEST_CHECK(env_.seed) << "env_.seed must not be zero"; if (!env_.input_filter.empty() && env_.fork_server) input_filter_cmd_.StartForkServer(TemporaryLocalDirPath(), "input_filter"); + if (env_.corpus_weight_method == Corpus::kWeightMethodNameForUniform) { + corpus_weight_method_ = Corpus::WeightMethod::Uniform; + } else if (env_.corpus_weight_method == Corpus::kWeightMethodNameForRecency) { + corpus_weight_method_ = Corpus::WeightMethod::Recency; + } else if (env_.corpus_weight_method == + Corpus::kWeightMethodNameForFeatureRarity) { + corpus_weight_method_ = Corpus::WeightMethod::FeatureRarity; + } else { + FUZZTEST_LOG(FATAL) << "Unknown corpus weight method " + << env_.corpus_weight_method; + } } void Centipede::CorpusToFiles(const Environment &env, std::string_view dir) { @@ -474,7 +486,8 @@ bool Centipede::RunBatch( } } } - corpus_.UpdateWeights(fs_, coverage_frontier_, env_.exec_time_weight_scaling); + corpus_.UpdateWeights(fs_, coverage_frontier_, corpus_weight_method_, + env_.exec_time_weight_scaling); return batch_gained_new_coverage; } diff --git a/centipede/centipede.h b/centipede/centipede.h index 24416c66d..8f8cf6658 100644 --- a/centipede/centipede.h +++ b/centipede/centipede.h @@ -185,6 +185,7 @@ class Centipede { FeatureSet fs_; Corpus corpus_; + Corpus::WeightMethod corpus_weight_method_; CoverageFrontier coverage_frontier_; size_t num_runs_ = 0; // counts executed inputs diff --git a/centipede/centipede_flags.inc b/centipede/centipede_flags.inc index 572e56d93..1ab484b36 100644 --- a/centipede/centipede_flags.inc +++ b/centipede/centipede_flags.inc @@ -192,6 +192,9 @@ CENTIPEDE_FLAG( bool, use_corpus_weights, true, "If true, use weighted distribution when choosing the corpus element " "to mutate. This flag is mostly for Centipede developers.") +CENTIPEDE_FLAG(std::string, corpus_weight_method, "feature_rarity", + "The weight method to use on corpus. Available options are " + "`uniform`, `recency`, and `feature_rarity` (default).") CENTIPEDE_FLAG( bool, exec_time_weight_scaling, true, "If true, scale the corpus weight by the execution time of each input.") diff --git a/centipede/corpus.cc b/centipede/corpus.cc index c2746910a..7f874f76e 100644 --- a/centipede/corpus.cc +++ b/centipede/corpus.cc @@ -45,13 +45,12 @@ namespace fuzztest::internal { // Corpus //------------------------------------------------------------------------------ -// Returns the weight of `fv` computed using `fs` and `coverage_frontier`. -static size_t ComputeWeight(const FeatureVec &fv, const FeatureSet &fs, - const CoverageFrontier &coverage_frontier) { - size_t weight = fs.ComputeWeight(fv); +// Returns the weight of `fv` computed using `coverage_frontier`. +static size_t ComputeFrontierWeight(const FeatureVec& fv, + const CoverageFrontier& coverage_frontier) { // The following is checking for the cases where PCTable is not present. In // such cases, we cannot use any ControlFlow related features. - if (coverage_frontier.MaxPcIndex() == 0) return weight; + if (coverage_frontier.MaxPcIndex() == 0) return 1; size_t frontier_weights_sum = 0; for (const auto feature : fv) { if (!feature_domains::kPCs.Contains(feature)) continue; @@ -63,7 +62,7 @@ static size_t ComputeWeight(const FeatureVec &fv, const FeatureSet &fs, frontier_weights_sum += coverage_frontier.FrontierWeight(pc_index); } } - return weight * (frontier_weights_sum + 1); // Multiply by at least 1. + return frontier_weights_sum + 1; // Multiply by at least 1. } std::pair Corpus::MaxAndAvgSize() const { @@ -79,14 +78,31 @@ std::pair Corpus::MaxAndAvgSize() const { void Corpus::UpdateWeights(const FeatureSet& fs, const CoverageFrontier& coverage_frontier, - bool scale_by_exec_time) { + WeightMethod method, bool scale_by_exec_time) { std::vector weights; weights.resize(records_.size()); for (size_t i = 0, n = records_.size(); i < n; ++i) { auto& record = records_[i]; const size_t unseen = fs.PruneFeaturesAndCountUnseen(record.features); FUZZTEST_CHECK_EQ(unseen, 0); - weights[i] = fs.ComputeWeight(record.features); + if (record.features.empty()) { + weights[i] = 0; + continue; + } + switch (method) { + case WeightMethod::Uniform: + weights[i] = 1; + break; + case WeightMethod::Recency: + weights[i] = i + 1; + break; + case WeightMethod::FeatureRarity: + weights[i] = fs.ComputeRarityWeight(record.features); + break; + default: + FUZZTEST_LOG(FATAL) << "Unknown corpus weight method"; + } + weights[i] *= ComputeFrontierWeight(record.features, coverage_frontier); } if (scale_by_exec_time) { double total_exec_time_usec = 0; @@ -206,7 +222,8 @@ void Corpus::Add(const ByteArray& data, const FeatureVec& fv, << "Got request to add empty element to corpus: ignoring"; FUZZTEST_CHECK_EQ(records_.size(), weighted_distribution_.size()); records_.push_back({data, fv, metadata, stats}); - weighted_distribution_.AddWeight(ComputeWeight(fv, fs, coverage_frontier)); + // Will be updated by `UpdateWeights`. + weighted_distribution_.AddWeight(0); } const CorpusRecord& Corpus::WeightedRandom(absl::BitGenRef rng) const { diff --git a/centipede/corpus.h b/centipede/corpus.h index 07164663e..29ded420a 100644 --- a/centipede/corpus.h +++ b/centipede/corpus.h @@ -98,6 +98,17 @@ struct CorpusRecord { // Allows to prune (forget) inputs that become uninteresting. class Corpus { public: + enum class WeightMethod { + Uniform, + Recency, + FeatureRarity, + }; + + static constexpr std::string_view kWeightMethodNameForUniform = "uniform"; + static constexpr std::string_view kWeightMethodNameForRecency = "recency"; + static constexpr std::string_view kWeightMethodNameForFeatureRarity = + "feature_rarity"; + Corpus() = default; Corpus(const Corpus &) = default; @@ -120,12 +131,12 @@ class Corpus { // Returns the number of removed elements. size_t Prune(const FeatureSet &fs, const CoverageFrontier &coverage_frontier, size_t max_corpus_size, Rng &rng); - // Updates the corpus weights according to `fs` and `coverage_frontier`. If - // `scale_by_exec_time` is set, scales the weights by the corpus execution - // time relative to the average. + // Updates the corpus weights according to `fs` and `coverage_frontier` using + // the weight `method`. If `scale_by_exec_time` is set, scales the weights by + // the corpus execution time relative to the average. void UpdateWeights(const FeatureSet& fs, const CoverageFrontier& coverage_frontier, - bool scale_by_exec_time); + WeightMethod method, bool scale_by_exec_time); // Accessors. diff --git a/centipede/corpus_test.cc b/centipede/corpus_test.cc index f816f46ea..4c215a49c 100644 --- a/centipede/corpus_test.cc +++ b/centipede/corpus_test.cc @@ -114,7 +114,9 @@ TEST(Corpus, Prune) { Add({{2}, {30, 40}}); Add({{3}, {40, 50}}); Add({{4}, {10, 20}}); - corpus.UpdateWeights(fs, coverage_frontier, /*scale_by_exec_time=*/false); + corpus.UpdateWeights(fs, coverage_frontier, + Corpus::WeightMethod::FeatureRarity, + /*scale_by_exec_time=*/false); // Prune. Features 20 and 40 are frequent => input {0} will be removed. EXPECT_EQ(corpus.NumActive(), 5); @@ -124,7 +126,9 @@ TEST(Corpus, Prune) { VerifyActiveInputs({{1}, {2}, {3}, {4}}); Add({{5}, {30, 60}}); - corpus.UpdateWeights(fs, coverage_frontier, /*scale_by_exec_time=*/false); + corpus.UpdateWeights(fs, coverage_frontier, + Corpus::WeightMethod::FeatureRarity, + /*scale_by_exec_time=*/false); EXPECT_EQ(corpus.NumTotal(), 6); // Prune. Feature 30 is now frequent => inputs {1} and {2} will be removed. @@ -145,6 +149,64 @@ TEST(Corpus, Prune) { EXPECT_EQ(corpus.NumTotal(), 6); } +TEST(Corpus, WeightMethodsWorkAsExpected) { + PCTable pc_table(100); + CFTable cf_table(100); + BinaryInfo bin_info{pc_table, {}, cf_table, {}, {}, {}}; + CoverageFrontier coverage_frontier(bin_info); + FeatureSet fs(3, {}); + Corpus corpus; + + auto Add = [&](const CorpusRecord& record) { + fs.MergeFeatures(record.features); + corpus.Add(record.data, record.features, /*metadata=*/{}, /*stats=*/{}, fs, + coverage_frontier); + }; + + Add({/*data=*/{0}, /*features=*/{30, 20}}); + Add({/*data=*/{1}, /*features=*/{10, 20}}); + Add({/*data=*/{2}, /*features=*/{10}}); + + constexpr int kNumIter = 10000; + std::vector freq; + + Rng rng; + auto ComputeFreq = [&]() { + freq.clear(); + freq.resize(corpus.NumActive()); + for (int i = 0; i < kNumIter; i++) { + const auto& record = corpus.WeightedRandom(rng); + const auto id = record.data[0]; + ASSERT_LT(id, freq.size()); + freq[id]++; + } + }; + + // The weights should be equal with the uniform method + corpus.UpdateWeights(fs, coverage_frontier, Corpus::WeightMethod::Uniform, + /*scale_by_exec_time=*/false); + ComputeFreq(); + EXPECT_NEAR(freq[0], kNumIter / 3, 100); + EXPECT_NEAR(freq[1], kNumIter / 3, 100); + EXPECT_NEAR(freq[2], kNumIter / 3, 100); + + // The weights should favor {2} over {1} over {0} with the recency method. + corpus.UpdateWeights(fs, coverage_frontier, Corpus::WeightMethod::Recency, + /*scale_by_exec_time=*/false); + ComputeFreq(); + EXPECT_GT(freq[2], freq[1] + 100); + EXPECT_GT(freq[1], freq[0] + 100); + + // The weights should favor {0} over {1} over {2} with the feature rarity + // method. + corpus.UpdateWeights(fs, coverage_frontier, + Corpus::WeightMethod::FeatureRarity, + /*scale_by_exec_time=*/false); + ComputeFreq(); + EXPECT_GT(freq[0], freq[1] + 100); + EXPECT_GT(freq[1], freq[2] + 100); +} + TEST(Corpus, ScalesWeightsWithExecTime) { PCTable pc_table(100); CFTable cf_table(100); @@ -181,14 +243,18 @@ TEST(Corpus, ScalesWeightsWithExecTime) { }; // The weights should be equal without exec time scaling. - corpus.UpdateWeights(fs, coverage_frontier, /*scale_by_exec_time=*/false); + corpus.UpdateWeights(fs, coverage_frontier, + Corpus::WeightMethod::FeatureRarity, + /*scale_by_exec_time=*/false); ComputeFreq(); EXPECT_NEAR(freq[0], kNumIter / 3, 100); EXPECT_NEAR(freq[1], kNumIter / 3, 100); EXPECT_NEAR(freq[2], kNumIter / 3, 100); // The weights should favor {0} over {1} over {2} with exec time scaling. - corpus.UpdateWeights(fs, coverage_frontier, /*scale_by_exec_time=*/true); + corpus.UpdateWeights(fs, coverage_frontier, + Corpus::WeightMethod::FeatureRarity, + /*scale_by_exec_time=*/true); ComputeFreq(); EXPECT_GT(freq[0], freq[1] + 100); EXPECT_GT(freq[1], freq[2] + 100); @@ -208,6 +274,9 @@ TEST(Corpus, PruneCorpusWithAllEmptyFeatureInputs) { coverage_frontier); corpus.Add(/*data=*/{2}, /*fv=*/{}, /*metadata=*/{}, /*stats=*/{}, fs, coverage_frontier); + corpus.UpdateWeights(fs, coverage_frontier, + Corpus::WeightMethod::FeatureRarity, + /*scale_by_exec_time=*/false); // Should not crash. corpus.Prune(fs, coverage_frontier, max_corpus_size, rng); } @@ -231,6 +300,9 @@ TEST(Corpus, PruneRegressionTest1) { Add({{1}, {10, 20}}); Add({{2}, {10}}); + corpus.UpdateWeights(fs, coverage_frontier, + Corpus::WeightMethod::FeatureRarity, + /*scale_by_exec_time=*/false); corpus.Prune(fs, coverage_frontier, max_corpus_size, rng); } diff --git a/centipede/feature_set.cc b/centipede/feature_set.cc index 6806b2f94..2daf4cec5 100644 --- a/centipede/feature_set.cc +++ b/centipede/feature_set.cc @@ -139,8 +139,9 @@ void FeatureSet::MergeFeatures(const FeatureVec& features) { } __attribute__((noinline)) // to see it in profile. -uint64_t -FeatureSet::ComputeWeight(const FeatureVec &features) const { +double FeatureSet::ComputeRarityWeight(const FeatureVec& features) const { + // Use uint64_t to keep the previous behavior. Maybe we want to switch it to + // double later. uint64_t weight = 0; for (auto feature : features) { // The less frequent is the feature, the more valuable it is. diff --git a/centipede/feature_set.h b/centipede/feature_set.h index 7e85dfef4..beaa1c51b 100644 --- a/centipede/feature_set.h +++ b/centipede/feature_set.h @@ -94,10 +94,10 @@ class FeatureSet { return frequencies_[feature]; } - // Computes combined weight of `features`. - // The less frequent the feature is, the bigger its weight. - // The weight of a FeatureVec is a sum of individual feature weights. - uint64_t ComputeWeight(const FeatureVec &features) const; + // Computes combined weight of `features` based on the feature rarity that + // scales linearly. The less frequent the feature is, the bigger its + // weight. The weight of a FeatureVec is a sum of individual feature weights. + double ComputeRarityWeight(const FeatureVec& features) const; // Returns a debug string representing the state of *this. std::string DebugString() const; diff --git a/centipede/feature_set_test.cc b/centipede/feature_set_test.cc index 9a81d2d73..eb954397d 100644 --- a/centipede/feature_set_test.cc +++ b/centipede/feature_set_test.cc @@ -27,8 +27,8 @@ namespace { TEST(FeatureSet, ComputeWeight) { FeatureSet feature_set(10, {}); - auto W = [&](const FeatureVec &features) -> uint64_t { - return feature_set.ComputeWeight(features); + auto W = [&](const FeatureVec& features) -> uint64_t { + return feature_set.ComputeRarityWeight(features); }; feature_set.MergeFeatures({1, 2, 3}); @@ -60,8 +60,8 @@ TEST(FeatureSet, ComputeWeightWithDifferentDomains) { /* three features from domain #3 */ f3, f3 + 1, f3 + 2}); - auto weight = [&](const FeatureVec &features) -> uint64_t { - return feature_set.ComputeWeight(features); + auto weight = [&](const FeatureVec& features) -> uint64_t { + return feature_set.ComputeRarityWeight(features); }; // Test that features from a less frequent domain have more weight.