Bloom Filter is a data structure to check whether an element is in the set or not. A query returns either "possibly in set"(false positive) or "definitely not in set". It can be used if a certain rate of error is allowed. (e.g. check if an url has been visited in web crawling). Check for more details.
Bloom Filter is helpful for
Checking if a key is duplicate
Retrieving the key from dictionary
Retrieving the intersection of data
2. False Positive Rate
False Positive Rate is related to:
(1) The number of bits (the more the better)
(2) The number of hash functions (the more the better)
(3) The number of elements inserted (the less the better)
3. Implementation
import java.util.BitSet;
public class BloomFilter {
// Number of bits
private static final int DEFAULT_SIZE = 1 << 25;
// Seeds for different hash functions, seed is prime number
private static final int[] seeds = {5, 7, 11, 13, 31, 37, 61};
// Bit Map
private BitSet bits = new BitSet(DEFAULT_SIZE);
private SimpleHash hashFunctions[] = new SimpleHash[seeds.length];
public BloomFilter() {
// Initialize Hash Functions for Bloom Filter
for (int i = 0; i < seeds.length; i++) {
hashFunctions[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
}
}
public void add(String key) {
for (SimpleHash function : hashFunctions) {
bits.set(function.hash(key), true);
}
}
public boolean contains(String key) {
boolean res = true;
for (SimpleHash function : hashFunctions) {
res = res && bits.get(function.hash(key));
}
return res;
}
class SimpleHash {
int capacity;
int seed;
public SimpleHash(int capacity, int seed) {
this.capacity = capacity;
this.seed = seed;
}
public int hash(String key) {
int res = 0;
for (int i = 0; i < key.length(); i++) {
res = res * seed + key.charAt(i);
}
return (capacity - 1) & res;
}
}
}