-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathBloomSet.h
104 lines (84 loc) · 2.9 KB
/
BloomSet.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#ifndef BLOOMSET_H
#define BLOOMSET_H
#include <strings.h>
#include <iostream>
#include <string>
class BloomSet {
public:
BloomSet(uint64_t expectedElements) {
if(expectedElements == 0) expectedElements = 2;
// let's aim for ~0.0001 probability
bits = expectedElements * 20;
keybits = 7 * bits / expectedElements / 10;
data = new unsigned char[bits / 8 + 1];
bzero(data, bits / 8 + 1);
}
~BloomSet() {
delete[] data;
}
bool contains(const char *str, const size_t n) { return countBits(str, n) >= keybits; }
bool contains(const std::string &s) { return contains(s.c_str(), s.length()); }
// return true if the element already existed
bool insert(const char *str, const size_t n) { return setBits(str, n) >= keybits; }
bool insert(const std::string &s) { return insert(s.c_str(), s.length()); }
int estimateFill() {
int fill = 0;
unsigned int max = 256;
if(bits / 8 < max) max = bits / 8;
for(unsigned int i = 0; i < max; ++i) fill += data[i] & 1;
return (fill * 1000) / max;
}
private:
uint64_t bits;
uint64_t keybits;
unsigned char *data;
unsigned int setBits(const char *str, const size_t n) {
unsigned int count = 0;
uint64_t index = 0;
for(unsigned int i = 0; i < keybits; ++i) {
index = nextBit(index, str, n, i);
count +=!! (data[(index % bits) / 8] & (1 << ((index % bits) % 8)));
data[(index % bits) / 8] |= 1 << ((index % bits) % 8);
}
return count;
}
unsigned int countBits(const char *str, const size_t n) {
unsigned int count = 0;
uint64_t index = 0;
for(unsigned int i = 0; i < keybits; ++i) {
index = nextBit(index, str, n, i);
count +=!! (data[(index % bits) / 8] & (1 << ((index % bits) % 8)));
}
return count;
}
uint64_t nextBit(uint64_t lastBit, const char *str, const size_t n, char bitNo) {
uint64_t magic[] = {
0xF567789806898063ull,
0x4567779816798165ull,
0xC567769826698267ull,
0x4567759836598369ull,
0xD567749846498461ull,
0x4567739856398563ull,
0xE567729866298667ull,
0x4567719876198765ull,
};
const char *e = str + n;
while(str < e - 7) {
lastBit = (lastBit ^ 17) + *reinterpret_cast<const uint64_t *>(str) * magic[bitNo % 8] + (lastBit >> 8);
str += 8;
}
if(str < e - 3) {
lastBit = (lastBit ^ 17) + *reinterpret_cast<const uint32_t *>(str) * magic[bitNo % 8] + (lastBit >> 8);
str += 4;
}
if(str < e - 1) {
lastBit = (lastBit ^ 17) + *reinterpret_cast<const uint16_t *>(str) * magic[bitNo % 8] + (lastBit >> 8);
str += 2;
}
if(str < e) {
lastBit = (lastBit ^ 17) + *reinterpret_cast<const uint8_t *>(str) * magic[bitNo % 8] + (lastBit >> 8);
}
return lastBit;
}
};
#endif