bloom/bloom.go at master · kok-stack/bloom · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
package bloom

import (
	"errors"
	"fmt"
	"math"
	"sync"

	"github.com/spaolacci/murmur3"
)

// Bit-addressing constants used when turning a hash value into a
// (byte slot, bit offset) pair.
const (
	// bitPerByte is the number of bits held by one element of Filter.keys.
	bitPerByte = 8
	// mod7 masks the low three bits of a hash value, so h&mod7 is the same
	// as h%bitPerByte and selects a bit inside a single byte.
	mod7 = bitPerByte - 1
)

// Filter is the struct of a Bloom filter.
// The false positive rate p is approximately (1 - e^(-kn/m))^k, where
// k is the number of hash functions, m is the size of the filter and
// n is the number of elements inserted.
// p decreases as m increases, and increases as n increases.
type Filter struct {
	// lock is allocated by the constructors only when concurrent is true;
	// methods check concurrent before touching it.
	lock       *sync.RWMutex
	concurrent bool

	// NOTE(review): New allocates keys with m bytes (not m bits), and slot
	// indexes are masked with m-1, so m acts as a count of byte slots.
	m     uint64 // size of the filter, rounded up to a power of 2
	n     uint64 // number of inserted elements
	log2m uint64 // log_2 of m
	k     uint64 // the number of hash functions
	keys  []byte // byte array whose bits record the hashed locations
}

// New creates a bloom filter.
//
// size is the requested filter size. It is rounded up to the next power of
// two so that slot indexes can be computed with a bit mask, and that many
// bytes are allocated for the filter. A size of 0 is treated as 1.
// k is the number of hash functions applied to every element.
// race selects whether the filter guards its state with a sync.RWMutex.
func New(size uint64, k uint64, race bool) *Filter {
	// math.Log2(0) is -Inf, and converting -Inf to uint64 yields an
	// implementation-dependent value, so clamp to the smallest usable size
	// before taking the logarithm.
	if size == 0 {
		size = 1
	}
	log2 := uint64(math.Ceil(math.Log2(float64(size))))
	filter := &Filter{
		m:          1 << log2,
		log2m:      log2,
		k:          k,
		keys:       make([]byte, 1<<log2),
		concurrent: race,
	}
	// The mutex is only needed (and only allocated) for concurrent filters.
	if filter.concurrent {
		filter.lock = &sync.RWMutex{}
	}
	return filter
}

// NewWithExistData creates a bloom filter on top of existing filter bytes,
// for example the slice returned by GetData.
//
// When len(existData) is a power of two the slice is used directly as the
// filter storage, without copying. Any other length (including an empty or
// nil slice) is copied into a zero-padded buffer of the next power-of-two
// length, because slot indexes are masked with m-1 and would otherwise be
// able to point past the end of the slice.
// k is the number of hash functions applied to every element.
// race selects whether the filter guards its state with a sync.RWMutex.
func NewWithExistData(existData []byte, k uint64, race bool) *Filter {
	size := uint64(len(existData))
	// math.Log2(0) is -Inf, whose conversion to uint64 is
	// implementation-dependent; treat empty input as a one-byte filter.
	if size == 0 {
		size = 1
	}
	log2 := uint64(math.Ceil(math.Log2(float64(size))))
	m := uint64(1) << log2

	keys := existData
	if uint64(len(keys)) != m {
		// Pad so that every masked slot index stays inside the slice.
		keys = make([]byte, m)
		copy(keys, existData)
	}

	filter := &Filter{
		m:          m,
		log2m:      log2,
		k:          k,
		keys:       keys,
		concurrent: race,
	}
	if filter.concurrent {
		filter.lock = &sync.RWMutex{}
	}
	return filter
}

// GetData returns the byte array that backs the filter.
//
// The returned slice is the filter's own storage, not a copy: later changes
// made through Add, AddBatch, Reset or MergeInPlace are visible through it,
// and writing to it modifies the filter. For a concurrent filter only the
// read of the slice header is protected by the lock; access to the returned
// bytes afterwards is not.
func (f *Filter) GetData() []byte {
	if f.concurrent {
		// Reading the field does not mutate the filter, so a shared lock is
		// sufficient, as in Cap and KeySize.
		f.lock.RLock()
		defer f.lock.RUnlock()
	}

	return f.keys
}

// Add records data in the filter. For each of the k hash rounds it derives a
// location from the base hashes of data and sets the matching bit in keys.
// The element counter is incremented on every call, and the filter itself is
// returned so that calls can be chained. A concurrent filter holds its write
// lock for the duration of the call.
func (f *Filter) Add(data []byte) *Filter {
	if f.concurrent {
		f.lock.Lock()
		defer f.lock.Unlock()
	}
	hashes := baseHash(data)
	for round := uint64(0); round < f.k; round++ {
		byteIdx, bitIdx := f.location(location(hashes, round))
		f.keys[byteIdx] |= byte(1) << bitIdx
	}
	f.n++
	return f
}

// Test reports whether data may have been added to the filter. It returns
// false as soon as one of the k probed bits is unset, and true only when all
// of them are set. A concurrent filter holds its read lock for the duration
// of the call.
func (f *Filter) Test(data []byte) bool {
	if f.concurrent {
		f.lock.RLock()
		defer f.lock.RUnlock()
	}
	hashes := baseHash(data)
	for round := uint64(0); round < f.k; round++ {
		byteIdx, bitIdx := f.location(location(hashes, round))
		if mask := byte(1) << bitIdx; f.keys[byteIdx]&mask == 0 {
			return false
		}
	}
	return true
}

// AddString converts s with str2Bytes and adds the resulting bytes to the
// filter. It returns the filter, as Add does.
func (f *Filter) AddString(s string) *Filter {
	return f.Add(str2Bytes(s))
}

// TestString converts s with str2Bytes and reports whether the resulting
// bytes may exist in the filter.
func (f *Filter) TestString(s string) bool {
	return f.Test(str2Bytes(s))
}

// AddUInt16 converts num with uint16ToBytes and adds the resulting bytes to
// the filter. It returns the filter, as Add does.
func (f *Filter) AddUInt16(num uint16) *Filter {
	return f.Add(uint16ToBytes(num))
}

// TestUInt16 converts num with uint16ToBytes and reports whether the
// resulting bytes may exist in the filter.
func (f *Filter) TestUInt16(num uint16) bool {
	return f.Test(uint16ToBytes(num))
}

// AddUInt32 converts num with uint32ToBytes and adds the resulting bytes to
// the filter. It returns the filter, as Add does.
func (f *Filter) AddUInt32(num uint32) *Filter {
	return f.Add(uint32ToBytes(num))
}

// TestUInt32 converts num with uint32ToBytes and reports whether the
// resulting bytes may exist in the filter.
func (f *Filter) TestUInt32(num uint32) bool {
	return f.Test(uint32ToBytes(num))
}

// AddUInt64 converts num with uint64ToBytes and adds the resulting bytes to
// the filter. It returns the filter, as Add does.
func (f *Filter) AddUInt64(num uint64) *Filter {
	return f.Add(uint64ToBytes(num))
}

// TestUInt64 converts num with uint64ToBytes and reports whether the
// resulting bytes may exist in the filter.
func (f *Filter) TestUInt64(num uint64) bool {
	return f.Test(uint64ToBytes(num))
}

// AddBatch adds every byte slice in dataArr to the filter. A concurrent
// filter takes its write lock once for the whole batch rather than once per
// element. The element counter grows by one per entry, and the filter is
// returned so that calls can be chained.
func (f *Filter) AddBatch(dataArr [][]byte) *Filter {
	if f.concurrent {
		f.lock.Lock()
		defer f.lock.Unlock()
	}
	for _, item := range dataArr {
		hashes := baseHash(item)
		for round := uint64(0); round < f.k; round++ {
			byteIdx, bitIdx := f.location(location(hashes, round))
			f.keys[byteIdx] |= byte(1) << bitIdx
		}
		f.n++
	}
	return f
}

// AddUint16Batch converts every value of numArr with uint16ToBytes and adds
// the results to the filter through a single AddBatch call.
func (f *Filter) AddUint16Batch(numArr []uint16) *Filter {
	encoded := make([][]byte, len(numArr))
	for idx, num := range numArr {
		encoded[idx] = uint16ToBytes(num)
	}
	return f.AddBatch(encoded)
}

// AddUint32Batch converts every value of numArr with uint32ToBytes and adds
// the results to the filter through a single AddBatch call.
func (f *Filter) AddUint32Batch(numArr []uint32) *Filter {
	encoded := make([][]byte, len(numArr))
	for idx, num := range numArr {
		encoded[idx] = uint32ToBytes(num)
	}
	return f.AddBatch(encoded)
}

// AddUin64Batch converts every value of numArr with uint64ToBytes and adds
// the results to the filter through a single AddBatch call.
func (f *Filter) AddUin64Batch(numArr []uint64) *Filter {
	encoded := make([][]byte, len(numArr))
	for idx, num := range numArr {
		encoded[idx] = uint64ToBytes(num)
	}
	return f.AddBatch(encoded)
}

// location splits a hashed location h into the index of a byte in keys and
// the index of a bit inside that byte. The byte index is h/bitPerByte reduced
// with the mask m-1, which equals a modulo by m because m is a power of two.
// The bit index is the low three bits of h.
func (f *Filter) location(h uint64) (uint64, uint64) {
	slotMask := f.m - 1
	return (h / bitPerByte) & slotMask, h & mod7
}

// location returns the ith hashed location derived from the four base hash
// values in h. It is equivalent to
//
//	h[i%2] + i*h[2+(((i+(i%2))%4)/2)]
//
// with the modulo operations written as bit masks. Arithmetic wraps around
// on uint64 overflow.
func location(h []uint64, i uint64) uint64 {
	parity := i & 1
	base := h[parity]
	step := h[2+((i+parity)&3)/2]
	return base + i*step
}

// baseHash derives the four 64-bit base hash values used by location. The
// first two are the halves of the murmur3 128-bit sum taken after writing
// data. The last two are the halves of the sum taken from the same hasher
// after one additional byte (value 1) has been written to it.
func baseHash(data []byte) []uint64 {
	hasher := murmur3.New128()
	hasher.Write(data) // #nosec
	first, second := hasher.Sum128()
	// Extend the input by one byte to obtain a second pair of hash values.
	hasher.Write([]byte{1}) // #nosec
	third, fourth := hasher.Sum128()
	return []uint64{first, second, third, fourth}
}

// Reset clears every bit of the filter and sets the inserted-element counter
// back to zero. The storage is zeroed in place, so its length is unchanged.
// A concurrent filter holds its write lock for the duration of the call.
func (f *Filter) Reset() {
	if f.concurrent {
		f.lock.Lock()
		defer f.lock.Unlock()
	}
	for idx := range f.keys {
		f.keys[idx] = 0
	}
	f.n = 0
}

// MergeInPlace merges another filter into the current one by OR-ing the
// bytes of g into the bytes of f.
//
// It returns an error and leaves f untouched when g is nil, when the two
// filters differ in m or in k, when g was created as a concurrent filter
// (g is read without taking its lock), or when the two byte arrays differ
// in length. The inserted-element counter of f is not changed by a merge.
func (f *Filter) MergeInPlace(g *Filter) error {
	if g == nil {
		return errors.New("cannot merge a nil filter")
	}
	if f.m != g.m {
		return fmt.Errorf("m's don't match: %d != %d", f.m, g.m)
	}
	if f.k != g.k {
		// Report the k values; the m values are already known to be equal.
		return fmt.Errorf("k's don't match: %d != %d", f.k, g.k)
	}
	if g.concurrent {
		return errors.New("merging concurrent filter is not support")
	}

	if f.concurrent {
		f.lock.Lock()
		defer f.lock.Unlock()
	}
	// Equal m does not guarantee equal storage length for filters built from
	// caller-supplied data, so check before indexing g.keys.
	if len(f.keys) != len(g.keys) {
		return fmt.Errorf("key lengths don't match: %d != %d", len(f.keys), len(g.keys))
	}
	for i := range f.keys {
		f.keys[i] |= g.keys[i]
	}
	return nil
}

// Cap returns m, the power-of-two filter size fixed at construction.
// NOTE(review): New allocates m bytes for the filter, so this value counts
// byte slots rather than individual bits; confirm which unit callers expect.
func (f *Filter) Cap() uint64 {
	if !f.concurrent {
		return f.m
	}
	f.lock.RLock()
	defer f.lock.RUnlock()
	return f.m
}

// KeySize returns the count of inserted elements, i.e. the counter that Add
// and AddBatch increment and Reset sets back to zero.
func (f *Filter) KeySize() uint64 {
	if !f.concurrent {
		return f.n
	}
	f.lock.RLock()
	defer f.lock.RUnlock()
	return f.n
}

// FalsePositiveRate returns the estimated false positive probability
// (1 - e^(-k*n/bits))^k, where k is the number of hash functions, n is the
// number of inserted elements and bits is the number of addressable bits in
// the filter.
//
// The filter addresses m byte slots and all eight bits inside each slot, so
// bits is m*8 rather than m. A concurrent filter holds its read lock for the
// duration of the call.
func (f *Filter) FalsePositiveRate() float64 {
	if f.concurrent {
		f.lock.RLock()
		defer f.lock.RUnlock()
	}
	// Every slot selected by the m-1 mask is a full byte.
	const bitsPerSlot = 8
	bits := float64(f.m) * bitsPerSlot
	// Convert before multiplying so that k*n cannot wrap around in uint64.
	k, n := float64(f.k), float64(f.n)
	return math.Pow(1-math.Exp(-k*n/bits), k)
}