From cec377dc3505b6a688a5c0974d4d100543b458f2 Mon Sep 17 00:00:00 2001 From: "John M. Schanck" Date: Mon, 11 Apr 2022 09:29:23 -0700 Subject: [PATCH] Add support for "SHA256CTR" filters --- filtercascade/__init__.py | 62 ++++++++++++++++++++++++----- filtercascade/fileformats.py | 6 +++ filtercascade/test_filtercascade.py | 26 ++++++++---- 3 files changed, 77 insertions(+), 17 deletions(-) diff --git a/filtercascade/__init__.py b/filtercascade/__init__.py index 3190e6b..f9ad1ca 100644 --- a/filtercascade/__init__.py +++ b/filtercascade/__init__.py @@ -15,6 +15,10 @@ log = logging.getLogger(__name__) +def byte_length(bit_length): + return (bit_length + 7) // 8 + + class InvertedLogicException(Exception): def __init__(self, *, depth, exclude_count, include_len): self.message = ( @@ -54,12 +58,14 @@ def __init__( nHashFuncs, level, hashAlg=fileformats.HashAlgorithm.MURMUR3, + hashOffset=0, salt=None, ): self.nHashFuncs = nHashFuncs self.size = size self.level = level self.hashAlg = fileformats.HashAlgorithm(hashAlg) + self.hashOffset = hashOffset self.salt = salt self.bitarray = bitarray.bitarray(self.size, endian="little") @@ -99,6 +105,23 @@ def hash(self, *, hash_no, key): ) return h + if self.hashAlg == fileformats.HashAlgorithm.SHA256CTR: + b = [] + bytes_needed = byte_length(self.size.bit_length()) + offset = self.hashOffset + hash_no * bytes_needed + while len(b) < bytes_needed: + m = hashlib.sha256() + m.update(fileformats.bloomer_sha256ctr_hash_struct.pack(offset // 32)) + m.update(self.salt) + m.update(key) + digest = m.digest() + i = offset % 32 + x = digest[i : i + bytes_needed - len(b)] + b.extend(x) + offset += len(x) + h = int.from_bytes(b, byteorder="little", signed=False) % self.size + return h + raise Exception(f"Unknown hash algorithm: {self.hashAlg}") def add(self, key): @@ -136,13 +159,19 @@ def filter_with_characteristics( elements, falsePositiveRate, hashAlg=fileformats.HashAlgorithm.MURMUR3, + hashOffset=0, salt=None, level=1, ): nHashFuncs = Bloomer.calc_n_hashes(falsePositiveRate) size = Bloomer.calc_size(nHashFuncs, elements, falsePositiveRate) return Bloomer( - size=size, nHashFuncs=nHashFuncs, level=level, hashAlg=hashAlg, salt=salt + size=size, + nHashFuncs=nHashFuncs, + level=level, + hashAlg=hashAlg, + hashOffset=hashOffset, + salt=salt, ) @classmethod @@ -161,7 +190,7 @@ def calc_size(cls, nHashFuncs, elements, falsePositiveRate): min_bits = math.ceil(1.44 * elements * math.log2(1 / falsePositiveRate)) assert min_bits > 0, "Always must have a positive number of bits" # Ensure the result is divisible by 8 for full bytes - return 8 * math.ceil(min_bits / 8) + return 8 * byte_length(min_bits) @classmethod def from_buf(cls, buf, salt=None): @@ -206,10 +235,10 @@ def __init__( invertedLogic=None, ): """ - Construct a FilterCascade. - error_rates: If not supplied, defaults will be calculated - invertedLogic: If not supplied (or left as None), it will be auto- - detected. + Construct a FilterCascade. + error_rates: If not supplied, defaults will be calculated + invertedLogic: If not supplied (or left as None), it will be auto- + detected. """ self.filters = filters or [] self.growth_factor = growth_factor @@ -250,10 +279,10 @@ def set_crlite_error_rates(self, *, include_len, exclude_len): def initialize(self, *, include, exclude): """ - Arg "exclude" is potentially larger than main memory, so it should - be assumed to be passed as a lazy-loading iterator. If it isn't, - that's fine. The "include" arg must fit in memory and should be - assumed to be a set. + Arg "exclude" is potentially larger than main memory, so it should + be assumed to be passed as a lazy-loading iterator. If it isn't, + that's fine. The "include" arg must fit in memory and should be + assumed to be a set. """ try: iter(exclude) @@ -286,6 +315,13 @@ def initialize(self, *, include, exclude): er = self.error_rates[depth - 1] if depth > len(self.filters): + if len(self.filters) == 0: + hashOffset = 0 + else: + prev = self.filters[-1] + hashOffset = prev.hashOffset + prev.nHashFuncs * byte_length( + prev.size.bit_length() + ) self.filters.append( Bloomer.filter_with_characteristics( elements=max( @@ -296,10 +332,15 @@ def initialize(self, *, include, exclude): falsePositiveRate=er, level=depth, hashAlg=self.defaultHashAlg, + hashOffset=hashOffset, ) ) else: # Filter already created for this layer. Check size and resize if needed. + prev = self.filters[depth - 1] + hashOffset = prev.hashOffset + prev.nHashFuncs * byte_length( + prev.size.bit_length() + ) required_size = Bloomer.calc_size( self.filters[depth - 1].nHashFuncs, include_len, er ) @@ -310,6 +351,7 @@ def initialize(self, *, include, exclude): falsePositiveRate=er, level=depth, hashAlg=self.defaultHashAlg, + hashOffset=hashOffset, ) log.info( f"Resized filter at {depth}-depth layer to {self.filters[depth - 1].size}" diff --git a/filtercascade/fileformats.py b/filtercascade/fileformats.py index ca70623..aee5aca 100644 --- a/filtercascade/fileformats.py +++ b/filtercascade/fileformats.py @@ -7,6 +7,7 @@ class HashAlgorithm(IntEnum): MURMUR3 = 1 SHA256 = 2 + SHA256CTR = 3 # The header for each Bloom filter level @@ -24,6 +25,11 @@ class HashAlgorithm(IntEnum): # byte 4: layer number of this bloom filter, as an unsigned char bloomer_sha256_hash_struct = struct.Struct(b"