Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion count/include/count_min.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ class count_min_sketch{
std::vector<W, Allocator> _sketch_array; // the array stored by the sketch
uint64_t _seed;
W _total_weight;
std::vector<uint64_t> hash_seeds;
std::vector<uint64_t> _hash_seeds;

enum flags {IS_EMPTY};
static const uint8_t PREAMBLE_LONGS_SHORT = 2; // Empty -> need second byte for sketch parameters
Expand Down
45 changes: 32 additions & 13 deletions count/include/count_min_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ _num_buckets(num_buckets),
_sketch_array((num_hashes*num_buckets < 1<<30) ? num_hashes*num_buckets : 0, 0, _allocator),
_seed(seed),
_total_weight(0) {
if (num_buckets < 3) throw std::invalid_argument("Using fewer than 3 buckets incurs relative error greater than 1.");
if (num_buckets < 3) {
throw std::invalid_argument("Using fewer than 3 buckets incurs relative error greater than 1.");
}

// This check is to ensure later compatibility with a Java implementation whose maximum size can only
// be 2^31-1. We check only against 2^30 for simplicity.
Expand All @@ -50,10 +52,10 @@ _total_weight(0) {

std::default_random_engine rng(_seed);
std::uniform_int_distribution<uint64_t> extra_hash_seeds(0, std::numeric_limits<uint64_t>::max());
hash_seeds.reserve(num_hashes);
_hash_seeds.reserve(num_hashes);

for (uint64_t i=0; i < num_hashes; ++i) {
hash_seeds.push_back(extra_hash_seeds(rng) + _seed); // Adds the global seed to all hash functions.
_hash_seeds.push_back(extra_hash_seeds(rng) + _seed); // Adds the global seed to all hash functions.
}
}

Expand Down Expand Up @@ -128,7 +130,7 @@ std::vector<uint64_t> count_min_sketch<W,A>::get_hashes(const void* item, size_t
sketch_update_locations.reserve(_num_hashes);

uint64_t hash_seed_index = 0;
for (const auto &it: hash_seeds) {
for (const auto &it: _hash_seeds) {
HashState hashes;
MurmurHash3_x64_128(item, size, it, hashes); // ? BEWARE OVERFLOW.
uint64_t hash = hashes.h1;
Expand All @@ -147,7 +149,9 @@ W count_min_sketch<W,A>::get_estimate(int64_t item) const {return get_estimate(&

/**
 * Estimate for a string item.
 * A non-empty string is forwarded, as its character buffer and length,
 * to the (pointer, size) overload of get_estimate.
 * @param item string to query
 * @return 0 if item is empty, otherwise the value returned by the (pointer, size) overload
 */
template<typename W, typename A>
W count_min_sketch<W,A>::get_estimate(const std::string& item) const {
  if (item.empty()) {
    return 0; // Empty strings are not inserted into the sketch.
  }
  return get_estimate(item.c_str(), item.length());
}

Expand Down Expand Up @@ -176,7 +180,9 @@ void count_min_sketch<W,A>::update(int64_t item, W weight) {

/**
 * Update with a string item and a weight.
 * A non-empty string is forwarded, as its character buffer and length,
 * to the (pointer, size, weight) overload of update.
 * @param item string to insert; an empty string is ignored
 * @param weight weight passed through unchanged to the (pointer, size, weight) overload
 */
template<typename W, typename A>
void count_min_sketch<W,A>::update(const std::string& item, W weight) {
  if (item.empty()) {
    return; // Empty strings are not inserted into the sketch.
  }
  update(item.c_str(), item.length(), weight);
}

Expand All @@ -201,7 +207,9 @@ W count_min_sketch<W,A>::get_upper_bound(int64_t item) const {return get_upper_b

/**
 * Upper bound for a string item.
 * A non-empty string is forwarded, as its character buffer and length,
 * to the (pointer, size) overload of get_upper_bound.
 * @param item string to query
 * @return 0 if item is empty, otherwise the value returned by the (pointer, size) overload
 */
template<typename W, typename A>
W count_min_sketch<W,A>::get_upper_bound(const std::string& item) const {
  if (item.empty()) {
    return 0; // Empty strings are not inserted into the sketch.
  }
  return get_upper_bound(item.c_str(), item.length());
}

Expand All @@ -218,7 +226,9 @@ W count_min_sketch<W,A>::get_lower_bound(int64_t item) const {return get_lower_b

/**
 * Lower bound for a string item.
 * A non-empty string is forwarded, as its character buffer and length,
 * to the (pointer, size) overload of get_lower_bound.
 * @param item string to query
 * @return 0 if item is empty, otherwise the value returned by the (pointer, size) overload
 */
template<typename W, typename A>
W count_min_sketch<W,A>::get_lower_bound(const std::string& item) const {
  if (item.empty()) {
    return 0; // Empty strings are not inserted into the sketch.
  }
  return get_lower_bound(item.c_str(), item.length());
}

Expand Down Expand Up @@ -290,7 +300,9 @@ void count_min_sketch<W,A>::serialize(std::ostream& os) const {
write(os, nhashes);
write(os, seed_hash);
write(os, unused8);
if (is_empty()) return; // sketch is empty, no need to write further bytes.
if (is_empty()) {
return; // sketch is empty, no need to write further bytes.
}

// Long 2
write(os, _total_weight);
Expand Down Expand Up @@ -327,7 +339,9 @@ auto count_min_sketch<W,A>::deserialize(std::istream& is, uint64_t seed, const A
}
count_min_sketch c(nhashes, nbuckets, seed, allocator);
const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
if (is_empty == 1) return c; // sketch is empty, no need to read further.
if (is_empty == 1) {
return c; // sketch is empty, no need to read further.
}

// Set the sketch weight and read in the sketch values
const auto weight = read<W>(is);
Expand Down Expand Up @@ -373,7 +387,9 @@ auto count_min_sketch<W,A>::serialize(unsigned header_size_bytes) const -> vecto
ptr += copy_to_mem(nhashes, ptr);
ptr += copy_to_mem(seed_hash, ptr);
ptr += copy_to_mem(null_characters_8, ptr);
if (is_empty()) return bytes; // sketch is empty, no need to write further bytes.
if (is_empty()) {
return bytes; // sketch is empty, no need to write further bytes.
}

// Long 2
const W t_weight = _total_weight;
Expand Down Expand Up @@ -423,7 +439,9 @@ auto count_min_sketch<W,A>::deserialize(const void* bytes, size_t size, uint64_t
}
count_min_sketch c(nhashes, nbuckets, seed, allocator);
const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
if (is_empty) return c; // sketch is empty, no need to read further.
if (is_empty) {
return c; // sketch is empty, no need to read further.
}

ensure_minimum_memory(size, sizeof(W) * (1 + nbuckets * nhashes));

Expand All @@ -449,8 +467,9 @@ string<A> count_min_sketch<W,A>::to_string() const {
// count the number of used entries in the sketch
uint64_t num_nonzero = 0;
for (const auto entry: _sketch_array) {
if (entry != static_cast<W>(0.0))
if (entry != static_cast<W>(0.0)) {
++num_nonzero;
}
}

// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
Expand Down