1 /**
2  * A bloom filter implementation
3  *
4  * Copyright: © 2013 - $(YEAR) Martin Nowak
5  * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6  * Authors: Martin Nowak
7  *
8  */
9 module bloom;
10 import core.bitop, std.typecons;
11 
12 /**
13  * A bloom filter is a fast and space-efficient probabilistic data
14  * structure to test whether an element is member of a set.
15  * False positive matches are possible, false negative matches are not.
16  * Elements can only be added not removed.
17  *
18  * Asymptotic false-positive rates for different BitsPerEntry settings.
19  * $(TABLE
20  *   $(THEAD BitsPerEntry, False Positive Rate)
21  *   $(TROW 1, 63.2%)
22  *   $(TROW 2, 39.3%)
23  *   $(TROW 3, 23.7%)
24  *   $(TROW 4, 14.7% (default))
25  *   $(TROW 5, 9.18%)
26  *   $(TROW 6, 5.61%)
27  *   $(TROW 7, 3.47%)
28  *   $(TROW 8, 2.15%)
29  *   $(TROW 9, 1.33%)
30  * )
31  * Params:
32  *  BitsPerEntry = Set the number of bits allocated per entry (default 4).
33  *  cheapHash = Use a faster but less good hash function.
34  *
35  * Macros:
36  *   THEAD=$(TR $(THX $1, $+))
37  *   THX=$(TH $1)$(THX $+)
38  *   TROW=$(TR $(TDX $1, $+))
39  *   TDX=$(TD $1)$(TDX $+)
40  */
41 struct BloomFilter(size_t BitsPerEntry=4, CheapHash cheapHash=CheapHash.yes) if (BitsPerEntry > 0)
42 {
43     /// no copying
44     @disable this(this);
45 
46     /// construct a Bloom filter optimized for nentries
47     this(size_t nentries)
48     {
49         resize(nentries);
50     }
51 
52     /// insert a key
53     void insert(size_t key)
54     {
55         immutable hashes = hash(key);
56         foreach (i; SIota!K)
57             bts(_p, hashes[i]);
58     }
59 
60     /// test membership of key
61     bool test(size_t key) const
62     {
63         immutable hashes = hash(key);
64         foreach (i; SIota!K)
65             if (!bt(_p, hashes[i])) return false;
66         return true;
67     }
68 
69     ~this()
70     {
71         reset();
72     }
73 
74     /// free all bits
75     void reset()
76     {
77         import core.stdc.stdlib : free;
78 
79         _size = 0;
80         free(_p);
81         _p = null;
82     }
83 
84     /// clear all bits
85     void clear()
86     {
87         import core.stdc.string : memset;
88 
89         memset(_p, 0, _size / 8);
90     }
91 
92     /// get the reserved number of entries
93     size_t size() const
94     {
95         return _size / M_N;
96     }
97 
98     /// resize to nentries, all bits are cleared
99     void resize(size_t nentries)
100     in { assert(nentries); }
101     body
102     {
103         import core.stdc.stdlib : realloc;
104 
105         nentries = cast(size_t)1 << 1 + bsr(nentries - 1); // next pow 2
106         _size = M_N * nentries;
107         assert(_size >= 8);
108         _p = cast(size_t*)realloc(_p, _size / 8);
109         if (_size) clear();
110     }
111 
112 private:
113     // The optimal number of hash functions should be k = (m/n) * log(2)
114     // The asymptotic false positive rate is (1 - e^(-k*n/m))^k.
115     enum M_N = BitsPerEntry;
116     enum K = cast(size_t)(M_N * LN2 + 0.5);
117     enum real LN2 = 0x1.62e42fefa39ef35793c7673007e5fp-1L; /* ln 2  = 0.693147... */
118 
119     uint[K] hash(size_t key) const
120     {
121         uint[K] result = void;
122 
123         static if (cheapHash)
124         {
125             const(ubyte)* p = cast(ubyte*)&key;
126             // rolling FNV-1a
127             enum offset_basis = 2166136261;
128             enum FNV_prime = 16777619;
129 
130             uint hash = offset_basis;
131             foreach (i; SIota!(0, K))
132             {
133                 hash ^= p[i % key.sizeof];
134                 hash *= FNV_prime;
135                 result[i] = hash;
136             }
137         }
138         else
139         {
140             static if (K >= 1)
141             {
142                 // Hsieh's Superfast Hash
143                 ulong h0 = key;
144                 h0 ^= h0 << 3;
145                 h0 += h0 >> 5;
146                 h0 ^= h0 << 4;
147                 h0 += h0 >> 17;
148                 h0 ^= h0 << 25;
149                 h0 += h0 >> 6;
150                 result[0] = cast(uint)(h0 - (h0 >> 32));
151             }
152             static if (K >= 2)
153             {
154                 // Murmur Hash 3
155                 ulong h1 = key;
156                 h1 ^= h1 >> 16;
157                 h1 *= 0x85ebca6b;
158                 h1 ^= h1 >> 13;
159                 h1 *= 0xc2b2ae35;
160                 h1 ^= h1 >> 16;
161                 result[1] = cast(uint)(h1 - (h1 >> 32));
162             }
163             static if (K >= 3)
164             {
165                 // fast-hash
166                 ulong h2 = key;
167                 h2 ^= h2 >> 23;
168                 h2 *= 0x2127599bf4325c37UL;
169                 h2 ^= h2 >> 47;
170                 result[2] = cast(uint)(h2 - (h2 >> 32));
171             }
172             static if (K > 3)
173             {
174                 uint rol(uint n)(in uint x)
175                 {
176                     return x << n | x >> 32 - n;
177                 }
178                 // Bob Jenkins lookup3 final mix
179                 uint h3 = result[0], h4 = result[1], h5 = result[2];
180                 h5 ^= h4; h5 -= rol!14(h4);
181                 h3 ^= h5; h3 -= rol!11(h5);
182                 h4 ^= h3; h4 -= rol!25(h3);
183                 h5 ^= h4; h5 -= rol!16(h4);
184                 h3 ^= h5; h3 -= rol!4(h5);
185                 h4 ^= h3; h4 -= rol!14(h3);
186                 h5 ^= h4; h5 -= rol!24(h4);
187             }
188             static if (K >= 4) result[3] = h3;
189             static if (K >= 5) result[4] = h4;
190             static if (K >= 6) result[5] = h5;
191             static assert(K <= 6, "Only 6 hash functions defined but "~K.stringof~" needed for optimal results.");
192         }
193 
194         immutable mask = _size - 1;
195         foreach (i; SIota!(0, K))
196             result[i] &= mask;
197 
198         return result;
199     }
200 
201     size_t _size;
202     size_t* _p;
203 }
204 
205 /// Default BloomFilter for 16 entries.
206 unittest
207 {
208     auto filter = BloomFilter!()(16);
209     filter.insert(1);
210     assert(filter.test(1));
211     assert(!filter.test(2));
212     filter.insert(2);
213     assert(filter.test(2));
214 }
215 
216 /// Using 6 bits per entry.
217 unittest
218 {
219     auto filter = BloomFilter!6(16);
220 }
221 
222 /// Using a better but slower hash function.
223 unittest
224 {
225     auto filter = BloomFilter!(4, CheapHash.no)(16);
226 }
227 
228 deprecated("Use CheapHash.yes or CheapHash.no instead.")
229 template BloomFilter(size_t BitsPerEntry=4, bool cheapHash=true) if (BitsPerEntry > 0)
230 {
231     alias BloomFilter = BloomFilter!(BitsPerEntry, cast(CheapHash)cheapHash);
232 }
233 
234 unittest
235 {
236     import std.datetime.stopwatch, std.stdio;
237 
238     static bool bench(size_t BitsPerEntry)(size_t nentries)
239     {
240         writeln("bench BitsPerEntry ", BitsPerEntry, " nentries ", nentries);
241 
242         auto filter = BloomFilter!(4)(nentries);
243 
244         auto sw = StopWatch(AutoStart.yes);
245         foreach (i; 0 .. nentries)
246             filter.insert(i);
247         writeln("insert took ", sw.peek.total!"usecs", " µs"); sw.reset();
248         foreach (i; 0 .. nentries)
249             if (!filter.test(i)) return false;
250         writeln("test took ", sw.peek.total!"usecs", " µs"); sw.reset();
251         return true;
252     }
253 
254     assert(bench!(4)(1_000_000));
255     assert(bench!(8)(1_000_000));
256 }
257 
258 /// $(LINK2 http://dlang.org/phobos/std_typecons.html#.Flag,Flag) to specify whether or not a cheaper hash is used.
259 alias CheapHash = Flag!"cheapHash";
260 
261 private:
262 
263 template SIota(size_t start, size_t end) if (start <= end)
264 {
265     template TT(T...) { alias TT = T; }
266     static if (start < end)
267         alias SIota = TT!(start, SIota!(start + 1, end));
268     else
269         alias SIota = TT!();
270 }
271 
272 template SIota(size_t n)
273 {
274     alias SIota = SIota!(0, n);
275 }
276 
277 unittest
278 {
279     template TT(T...) { alias TT = T; }
280     enum N = 500;
281     foreach (cheapHash; TT!(CheapHash.yes, CheapHash.no))
282     {
283         foreach (K; SIota!(1, 10))
284         {
285             auto filter = BloomFilter!(K, cheapHash)(N);
286             assert(filter.size == 512);
287             auto p = cast(size_t)&filter / 4096;
288             filter.insert(p);
289             assert(filter.test(p));
290 
291             foreach (i; 0 .. N)
292             {
293                 filter.insert(p + i);
294                 assert(filter.test(p + i));
295             }
296             filter.clear();
297             foreach (i; 0 .. N)
298                 assert(!filter.test(p + i));
299         }
300     }
301 }