-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathloglog.py
More file actions
35 lines (30 loc) · 1.1 KB
/
loglog.py
File metadata and controls
35 lines (30 loc) · 1.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# This code is taken from Nick Johnson's Blog post about Cardinality Estimation
# http://blog.notdot.net/2012/09/Dam-Cool-Algorithms-Cardinality-Estimation
import hashlib
def hash(x):
    """Return the SHA-1 digest of ``str(x)`` as a non-negative integer.

    The name deliberately shadows the builtin ``hash`` within this module;
    it is kept because ``loglog`` calls it by this name.

    Arguments:
      x: Any object. It is converted with ``str()`` before being hashed, so
        values with equal string forms (e.g. ``123`` and ``"123"``) hash alike.
    Returns:
      The 160-bit SHA-1 digest interpreted as a base-16 integer.
    """
    data = str(x)
    # hashlib only accepts bytes. On Python 3 str() yields text, which must be
    # encoded first; on Python 2 str() already yields bytes and is used as is.
    if not isinstance(data, bytes):
        data = data.encode("utf-8")
    return int(hashlib.sha1(data).hexdigest(), 16)
def trailing_zeroes(num):
    """Return how many consecutive 0 bits sit at the low-order end of num.

    Zero contains no set bit at all, so it is reported as 32, i.e. the input
    is treated as a 32-bit integer in that one case.
    """
    if num == 0:
        return 32  # no set bit to stop at; fall back to the assumed 32-bit width
    count = 0
    # Drop the lowest bit until it is a 1, counting how many were dropped.
    while not (num & 1):
        num >>= 1
        count += 1
    return count
def loglog(values, k=10):
    """Estimate how many distinct elements the iterable ``values`` contains.

    Each element is hashed; the low k bits of the hash pick a bucket, and the
    bucket remembers the largest count of trailing zero bits seen in the
    remaining (shifted) hash bits. The estimate is derived from the mean of
    those per-bucket maxima.

    Arguments:
      values: An iterable of elements to estimate the cardinality of.
      k: The number of hash bits used as the bucket number; 2**k buckets are kept.
    Returns:
      A float estimate of the number of unique elements.
    """
    num_buckets = 2 ** k
    bucket_mask = num_buckets - 1  # selects the k least significant bits
    registers = [0] * num_buckets
    for item in values:
        digest = hash(item)
        slot = digest & bucket_mask
        rank = trailing_zeroes(digest >> k)  # bucket bits are discarded first
        if rank > registers[slot]:
            registers[slot] = rank
    mean_rank = float(sum(registers)) / num_buckets
    # NOTE(review): 0.79402 is presumably the bias-correction factor from the
    # referenced cardinality-estimation write-up -- confirm against the source.
    return 2 ** mean_rank * num_buckets * 0.79402