...
场景: 在10亿数据中判断某个数据是否存在
如果使用HashSet/HashMap来实现的话
查找的时间复杂度是O(1),但是我们来算一下存储空间,Hash值为Integer类型,占四个字节,那10亿条数据占用的空间就是:10亿*4/1024/1024/1024约等于3.7G......这个实现方案很明显不现实
如果使用布隆过滤器实现
占用的空间大约为2147483647/8/1024/1024=256M
由上面的分析可知, hash函数是存在hash冲突的, 所以布隆过滤器是会有误判的情况.
表现为:
如果某条记录被判断为不存在,则该记录必然不存在 如果某条记录被判断存在,则该记录可能会不存在 复制代码
public class BloomFilterDemo { private static final int insertions = 1000000; @Test public void bfTest1(){ //初始化一个存储string数据的布隆过滤器,初始化大小100w,不能设置为0 BloomFilter<String> bf = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), insertions,0.001); //初始化一个存储string数据的set,初始化大小100w Set<String> sets = new HashSet<>(insertions); //初始化一个存储string数据的set,初始化大小100w List<String> lists = new ArrayList<>(insertions); //向三个容器初始化100万个随机并且唯一的字符串---初始化操作 for (int i = 0; i < insertions; i++) { String uuid = UUID.randomUUID().toString(); bf.put(uuid); sets.add(uuid); lists.add(uuid); } //布隆过滤器错误判断的次数 int wrong = 0; //布隆过滤器正确判断的次数 int right = 0; for (int i = 0; i < 10000; i++) { //按照一定比例选择bf中肯定存在的字符串 String test = i%100==0?lists.get(i/100):UUID.randomUUID().toString(); if(bf.mightContain(test)){ if(sets.contains(test)){ right ++; }else{ wrong ++; } } } //100 System.out.println("=================right====================="+right); System.out.println("=================wrong====================="+wrong); } @Test public void bfTest2() { //预计要插入多少数据 int size = 1000000; //期望的误判率 double fpp = 0.01; BloomFilter<Integer> bloomFilter = BloomFilter.create(Funnels.integerFunnel(), size, fpp); //插入数据 for (int i = 0; i < 1000000; i++) { bloomFilter.put(i); } int count = 0; for (int i = 0; i < 1000000; i++) { if (!bloomFilter.mightContain(i)) { count++; System.out.println(i + "误判了"); } } for (int i = 1000000; i < 2000000; i++) { if (bloomFilter.mightContain(i)) { count++; System.out.println(i + "误判了"); } } System.out.println("总共的误判数:" + count); } } 复制代码
/** * * 原理是和 JDK 自带的BloomFilter类似的,我们看add方法,它先 Redis 缓存中是否有指定 key(如:orderBloomFilter) 的值,如果没有,则在 offset = 0 处,添加一个值为false,即为 0; * 然后调用createHashes(byte[] data, int hashes)方法,根据字节数组的内容生成digest,并将结果分割成 4 字节的整数并存储在数组中,数组中的整数可以理解为每次hash所得的hashcode的值。 * 最后,遍历hashcode数组,将hashcode%sizeOfBloomFilter取模所得下标所对应的值设为true,即为 1。 * * contains方法,同样调用createHashes(byte[] data, int hashes)得到字节数组内容所对应的hashcode数组。 * 遍历hashcode数组,如果有一个hashcode所对应的下标的值不为1,则该数据不存在。反之,只有所有的hashcode所对应的下标的值都为1,才能说明该数据已经存在。 * * @author: ZENGQINGXUN178 * @Date: 2019-8-27 10:30 * @Description: */ @Component public class BloomFilterService<E> { @Autowired private JedisCluster jedisCluster; /** * total length of the Bloom filter */ private static int sizeOfBloomFilter; /** * number of hash functions */ private static int numberOfHashFunctions; /** * 误差率 */ private static final double falsePositiveProbability = 0.01; /** * 预计容量 */ private static final int expectedNumberOfElements = 10000; private static String bloom_name = "bloom_name_1"; /** * encoding used for storing hash values as strings */ private final Charset charset = Charset.forName("UTF-8"); /** * MD5 gives good enough accuracy in most circumstances. Change to SHA1 if it's needed */ private static final String hashName = "MD5"; private static final MessageDigest digestFunction; /** * The digest method is reused between instances */ static { MessageDigest messageDigest; try { messageDigest = java.security.MessageDigest.getInstance(hashName); } catch (NoSuchAlgorithmException e) { messageDigest = null; } digestFunction = messageDigest; // numberOfHashFunctions = ceil(-ln(falsePositiveProbability)/ln2) numberOfHashFunctions = (int) Math.ceil(-(Math.log(falsePositiveProbability) / Math.log(2))); // sizeOfBloomFilter = ceil(numberOfHashFunctions*expectedNumberOfElements/ln2) sizeOfBloomFilter = (int) Math.ceil(numberOfHashFunctions * expectedNumberOfElements / Math.log(2)); } @PostConstruct public void init(){ // 1. 获取数据 List<String> datas = new ArrayList<>(); for (int i = 0; i < expectedNumberOfElements; i++) { datas.add(i + ""); } if (jedisCluster.get(bloom_name) == null) { jedisCluster.setbit(bloom_name, 0, false); } // 2. 把数据放进bloomFilter JedisClusterPipeline pipelined = JedisClusterPipeline.pipelined(jedisCluster); try { datas.forEach(e -> add(e.getBytes(charset),pipelined)); }finally { pipelined.syncAndReturnAll(); } } /** * Adds an object to the Bloom filter. The output from the object's * toString() method is used as input to the hash functions. * * @param element is an element to register in the Bloom filter. */ public void add(E element) { add(element.toString().getBytes(charset)); } private void add(byte[] bytes) { int[] hashes = createHashes(bytes, numberOfHashFunctions); for (int hash : hashes) { jedisCluster.setbit(bloom_name, Math.abs(hash % sizeOfBloomFilter), true); } } /** * Adds an array of bytes to the Bloom filter. * * @param bytes array of bytes to add to the Bloom filter. */ private void add(byte[] bytes,JedisClusterPipeline pipelined) { int[] hashes = createHashes(bytes, numberOfHashFunctions); for (int hash : hashes) { pipelined.setbit(bloom_name, Math.abs(hash % sizeOfBloomFilter), true); } } /** * Adds all elements from a Collection to the Bloom filter. * * @param c Collection of elements. */ public void addAll(Collection<? extends E> c) { for (E element : c) { add(element); } } /** * Returns true if the element could have been inserted into the Bloom filter. * Use getFalsePositiveProbability() to calculate the probability of this * being correct. * * @param element element to check. * @return true if the element could have been inserted into the Bloom filter. */ public boolean contains(E element) { return contains(element.toString().getBytes(charset)); } public int failCount(List<E> elements) { JedisClusterPipeline pipelined = JedisClusterPipeline.pipelined(jedisCluster); int count = 0; try { for (E e : elements) { if (!contains(e.toString().getBytes(charset), pipelined)) { count++; } } }finally { pipelined.close(); } return count; } /** * Returns true if the array of bytes could have been inserted into the Bloom filter. * Use getFalsePositiveProbability() to calculate the probability of this * being correct. * * @param bytes array of bytes to check. * @return true if the array could have been inserted into the Bloom filter. */ private boolean contains(byte[] bytes) { int[] hashes = createHashes(bytes, numberOfHashFunctions); for (int hash : hashes) { if (!jedisCluster.getbit(bloom_name, Math.abs(hash % sizeOfBloomFilter))) { return false; } } return true; } private boolean contains(byte[] bytes,JedisClusterPipeline pipelined) { int[] hashes = createHashes(bytes, numberOfHashFunctions); for (int hash : hashes) { pipelined.getbit(bloom_name, Math.abs(hash % sizeOfBloomFilter)); } List<Object> objects = pipelined.syncAndReturnAll(); Optional<Object> any = objects.stream().filter(Boolean.FALSE::equals).findAny(); return any.isPresent(); } /** * Returns true if all the elements of a Collection could have been inserted * into the Bloom filter. Use getFalsePositiveProbability() to calculate the * probability of this being correct. * * @param c elements to check. * @return true if all the elements in c could have been inserted into the Bloom filter. */ public boolean containsAll(Collection<? extends E> c) { for (E element : c) { if (!contains(element)) { return false; } } return true; } /** * 根据字节数组的内容生成摘要,并将结果分割成 4 字节的整数并存储在数组中。 * 调用 digest 函数,直到生成所需的 int 数。每次生成摘要之前,都需要加盐,并且 salt++。 * * @param data specifies input data. * @param hashes number of hashes/int's to produce. * @return array of int-sized hashes */ private static int[] createHashes(byte[] data, int hashes) { int[] result = new int[hashes]; int k = 0; byte salt = 0; while (k < hashes) { byte[] digest; synchronized (digestFunction) { digestFunction.update(salt); salt++; digest = digestFunction.digest(data); } for (int i = 0; i < digest.length / 4 && k < hashes; i++) { int h = 0; for (int j = (i * 4); j < (i * 4) + 4; j++) { h <<= 8; h |= ((int) digest[j]) & 0xFF; } result[k] = h; k++; } } return result; } /** * Calculates a hash code for this class. * * @return hash code representing the contents of an instance of this class. */ @Override public int hashCode() { int hash = 7; hash = 61 * hash + sizeOfBloomFilter; hash = 61 * hash + expectedNumberOfElements; hash = 61 * hash + numberOfHashFunctions; return hash; } } 复制代码