1 /** 2 * A bloom filter implementation 3 * 4 * Copyright: © 2013 - $(YEAR) Martin Nowak 5 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 6 * Authors: Martin Nowak 7 * 8 */ 9 module bloom; 10 import core.bitop, std.typecons; 11 12 /** 13 * A bloom filter is a fast and space-efficient probabilistic data 14 * structure to test whether an element is member of a set. 15 * False positive matches are possible, false negative matches are not. 16 * Elements can only be added not removed. 17 * 18 * Asymptotic false-positive rates for different BitsPerEntry settings. 19 * $(TABLE 20 * $(THEAD BitsPerEntry, False Positive Rate) 21 * $(TROW 1, 63.2%) 22 * $(TROW 2, 39.3%) 23 * $(TROW 3, 23.7%) 24 * $(TROW 4, 14.7% (default)) 25 * $(TROW 5, 9.18%) 26 * $(TROW 6, 5.61%) 27 * $(TROW 7, 3.47%) 28 * $(TROW 8, 2.15%) 29 * $(TROW 9, 1.33%) 30 * ) 31 * Params: 32 * BitsPerEntry = Set the number of bits allocated per entry (default 4). 33 * cheapHash = Use a faster but less good hash function. 34 * 35 * Macros: 36 * THEAD=$(TR $(THX $1, $+)) 37 * THX=$(TH $1)$(THX $+) 38 * TROW=$(TR $(TDX $1, $+)) 39 * TDX=$(TD $1)$(TDX $+) 40 */ 41 struct BloomFilter(size_t BitsPerEntry=4, CheapHash cheapHash=CheapHash.yes) if (BitsPerEntry > 0) 42 { 43 /// no copying 44 @disable this(this); 45 46 /// construct a Bloom filter optimized for nentries 47 this(size_t nentries) 48 { 49 resize(nentries); 50 } 51 52 /// insert a key 53 void insert(size_t key) 54 { 55 immutable hashes = hash(key); 56 foreach (i; SIota!K) 57 bts(_p, hashes[i]); 58 } 59 60 /// test membership of key 61 bool test(size_t key) const 62 { 63 immutable hashes = hash(key); 64 foreach (i; SIota!K) 65 if (!bt(_p, hashes[i])) return false; 66 return true; 67 } 68 69 ~this() 70 { 71 reset(); 72 } 73 74 /// free all bits 75 void reset() 76 { 77 import core.stdc.stdlib : free; 78 79 _size = 0; 80 free(_p); 81 _p = null; 82 } 83 84 /// clear all bits 85 void clear() 86 { 87 import core.stdc.string : memset; 88 89 memset(_p, 0, _size / 8); 90 } 91 92 /// get the reserved number of entries 93 size_t size() const 94 { 95 return _size / M_N; 96 } 97 98 /// resize to nentries, all bits are cleared 99 void resize(size_t nentries) 100 in { assert(nentries); } 101 body 102 { 103 import core.stdc.stdlib : realloc; 104 105 nentries = cast(size_t)1 << 1 + bsr(nentries - 1); // next pow 2 106 _size = M_N * nentries; 107 assert(_size >= 8); 108 _p = cast(size_t*)realloc(_p, _size / 8); 109 if (_size) clear(); 110 } 111 112 private: 113 // The optimal number of hash functions should be k = (m/n) * log(2) 114 // The asymptotic false positive rate is (1 - e^(-k*n/m))^k. 115 enum M_N = BitsPerEntry; 116 enum K = cast(size_t)(M_N * LN2 + 0.5); 117 enum real LN2 = 0x1.62e42fefa39ef35793c7673007e5fp-1L; /* ln 2 = 0.693147... */ 118 119 uint[K] hash(size_t key) const 120 { 121 uint[K] result = void; 122 123 static if (cheapHash) 124 { 125 const(ubyte)* p = cast(ubyte*)&key; 126 // rolling FNV-1a 127 enum offset_basis = 2166136261; 128 enum FNV_prime = 16777619; 129 130 uint hash = offset_basis; 131 foreach (i; SIota!(0, K)) 132 { 133 hash ^= p[i % key.sizeof]; 134 hash *= FNV_prime; 135 result[i] = hash; 136 } 137 } 138 else 139 { 140 static if (K >= 1) 141 { 142 // Hsieh's Superfast Hash 143 ulong h0 = key; 144 h0 ^= h0 << 3; 145 h0 += h0 >> 5; 146 h0 ^= h0 << 4; 147 h0 += h0 >> 17; 148 h0 ^= h0 << 25; 149 h0 += h0 >> 6; 150 result[0] = cast(uint)(h0 - (h0 >> 32)); 151 } 152 static if (K >= 2) 153 { 154 // Murmur Hash 3 155 ulong h1 = key; 156 h1 ^= h1 >> 16; 157 h1 *= 0x85ebca6b; 158 h1 ^= h1 >> 13; 159 h1 *= 0xc2b2ae35; 160 h1 ^= h1 >> 16; 161 result[1] = cast(uint)(h1 - (h1 >> 32)); 162 } 163 static if (K >= 3) 164 { 165 // fast-hash 166 ulong h2 = key; 167 h2 ^= h2 >> 23; 168 h2 *= 0x2127599bf4325c37UL; 169 h2 ^= h2 >> 47; 170 result[2] = cast(uint)(h2 - (h2 >> 32)); 171 } 172 static if (K > 3) 173 { 174 uint rol(uint n)(in uint x) 175 { 176 return x << n | x >> 32 - n; 177 } 178 // Bob Jenkins lookup3 final mix 179 uint h3 = result[0], h4 = result[1], h5 = result[2]; 180 h5 ^= h4; h5 -= rol!14(h4); 181 h3 ^= h5; h3 -= rol!11(h5); 182 h4 ^= h3; h4 -= rol!25(h3); 183 h5 ^= h4; h5 -= rol!16(h4); 184 h3 ^= h5; h3 -= rol!4(h5); 185 h4 ^= h3; h4 -= rol!14(h3); 186 h5 ^= h4; h5 -= rol!24(h4); 187 } 188 static if (K >= 4) result[3] = h3; 189 static if (K >= 5) result[4] = h4; 190 static if (K >= 6) result[5] = h5; 191 static assert(K <= 6, "Only 6 hash functions defined but "~K.stringof~" needed for optimal results."); 192 } 193 194 immutable mask = _size - 1; 195 foreach (i; SIota!(0, K)) 196 result[i] &= mask; 197 198 return result; 199 } 200 201 size_t _size; 202 size_t* _p; 203 } 204 205 /// Default BloomFilter for 16 entries. 206 unittest 207 { 208 auto filter = BloomFilter!()(16); 209 filter.insert(1); 210 assert(filter.test(1)); 211 assert(!filter.test(2)); 212 filter.insert(2); 213 assert(filter.test(2)); 214 } 215 216 /// Using 6 bits per entry. 217 unittest 218 { 219 auto filter = BloomFilter!6(16); 220 } 221 222 /// Using a better but slower hash function. 223 unittest 224 { 225 auto filter = BloomFilter!(4, CheapHash.no)(16); 226 } 227 228 deprecated("Use CheapHash.yes or CheapHash.no instead.") 229 template BloomFilter(size_t BitsPerEntry=4, bool cheapHash=true) if (BitsPerEntry > 0) 230 { 231 alias BloomFilter = BloomFilter!(BitsPerEntry, cast(CheapHash)cheapHash); 232 } 233 234 unittest 235 { 236 import std.datetime.stopwatch, std.stdio; 237 238 static bool bench(size_t BitsPerEntry)(size_t nentries) 239 { 240 writeln("bench BitsPerEntry ", BitsPerEntry, " nentries ", nentries); 241 242 auto filter = BloomFilter!(4)(nentries); 243 244 auto sw = StopWatch(AutoStart.yes); 245 foreach (i; 0 .. nentries) 246 filter.insert(i); 247 writeln("insert took ", sw.peek.total!"usecs", " µs"); sw.reset(); 248 foreach (i; 0 .. nentries) 249 if (!filter.test(i)) return false; 250 writeln("test took ", sw.peek.total!"usecs", " µs"); sw.reset(); 251 return true; 252 } 253 254 assert(bench!(4)(1_000_000)); 255 assert(bench!(8)(1_000_000)); 256 } 257 258 /// $(LINK2 http://dlang.org/phobos/std_typecons.html#.Flag,Flag) to specify whether or not a cheaper hash is used. 259 alias CheapHash = Flag!"cheapHash"; 260 261 private: 262 263 template SIota(size_t start, size_t end) if (start <= end) 264 { 265 template TT(T...) { alias TT = T; } 266 static if (start < end) 267 alias SIota = TT!(start, SIota!(start + 1, end)); 268 else 269 alias SIota = TT!(); 270 } 271 272 template SIota(size_t n) 273 { 274 alias SIota = SIota!(0, n); 275 } 276 277 unittest 278 { 279 template TT(T...) { alias TT = T; } 280 enum N = 500; 281 foreach (cheapHash; TT!(CheapHash.yes, CheapHash.no)) 282 { 283 foreach (K; SIota!(1, 10)) 284 { 285 auto filter = BloomFilter!(K, cheapHash)(N); 286 assert(filter.size == 512); 287 auto p = cast(size_t)&filter / 4096; 288 filter.insert(p); 289 assert(filter.test(p)); 290 291 foreach (i; 0 .. N) 292 { 293 filter.insert(p + i); 294 assert(filter.test(p + i)); 295 } 296 filter.clear(); 297 foreach (i; 0 .. N) 298 assert(!filter.test(p + i)); 299 } 300 } 301 }