cobalt / cobalt / 25399f97090f8a86f56525c38486080cb2747250 / . / src / third_party / mozjs-45 / mfbt / BloomFilter.h

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ | |

/* vim: set ts=8 sts=2 et sw=2 tw=80: */ | |

/* This Source Code Form is subject to the terms of the Mozilla Public | |

* License, v. 2.0. If a copy of the MPL was not distributed with this | |

* file, You can obtain one at http://mozilla.org/MPL/2.0/. */ | |

/* | |

* A counting Bloom filter implementation. This allows consumers to | |

* do fast probabilistic "is item X in set Y?" testing which will | |

* never answer "no" when the correct answer is "yes" (but might | |

* incorrectly answer "yes" when the correct answer is "no"). | |

*/ | |

#ifndef mozilla_BloomFilter_h | |

#define mozilla_BloomFilter_h | |

#include "mozilla/Assertions.h" | |

#include "mozilla/Likely.h" | |

#include <stdint.h> | |

#include <string.h> | |

namespace mozilla { | |

/* | |

* This class implements a counting Bloom filter as described at | |

* <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with | |

* 8-bit counters. This allows quick probabilistic answers to the | |

* question "is object X in set Y?" where the contents of Y might not | |

* be time-invariant. The probabilistic nature of the test means that | |

* sometimes the answer will be "yes" when it should be "no". If the | |

* answer is "no", then X is guaranteed not to be in Y. | |

* | |

* The filter is parametrized on KeySize, which is the size of the key | |

* generated by each of hash functions used by the filter, in bits, | |

* and the type of object T being added and removed. T must implement | |

* a |uint32_t hash() const| method which returns a uint32_t hash key | |

* that will be used to generate the two separate hash functions for | |

* the Bloom filter. This hash key MUST be well-distributed for good | |

* results! KeySize is not allowed to be larger than 16. | |

* | |

* The filter uses exactly 2**KeySize bytes of memory. From now on we | |

* will refer to the memory used by the filter as M. | |

* | |

* The expected rate of incorrect "yes" answers depends on M and on | |

* the number N of objects in set Y. As long as N is small compared | |

* to M, the rate of such answers is expected to be approximately | |

* 4*(N/M)**2 for this filter. In practice, if Y has a few hundred | |

* elements then using a KeySize of 12 gives a reasonably low | |

* incorrect answer rate. A KeySize of 12 has the additional benefit | |

* of using exactly one page for the filter in typical hardware | |

* configurations. | |

*/ | |

template<unsigned KeySize, class T> | |

class BloomFilter | |

{ | |

/* | |

* A counting Bloom filter with 8-bit counters. For now we assume | |

* that having two hash functions is enough, but we may revisit that | |

* decision later. | |

* | |

* The filter uses an array with 2**KeySize entries. | |

* | |

* Assuming a well-distributed hash function, a Bloom filter with | |

* array size M containing N elements and | |

* using k hash function has expected false positive rate exactly | |

* | |

* $ (1 - (1 - 1/M)^{kN})^k $ | |

* | |

* because each array slot has a | |

* | |

* $ (1 - 1/M)^{kN} $ | |

* | |

* chance of being 0, and the expected false positive rate is the | |

* probability that all of the k hash functions will hit a nonzero | |

* slot. | |

* | |

* For reasonable assumptions (M large, kN large, which should both | |

* hold if we're worried about false positives) about M and kN this | |

* becomes approximately | |

* | |

* $$ (1 - \exp(-kN/M))^k $$ | |

* | |

* For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$, | |

* or in other words | |

* | |

* $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$ | |

* | |

* where r is the false positive rate. This can be used to compute | |

* the desired KeySize for a given load N and false positive rate r. | |

* | |

* If N/M is assumed small, then the false positive rate can | |

* further be approximated as 4*N^2/M^2. So increasing KeySize by | |

* 1, which doubles M, reduces the false positive rate by about a | |

* factor of 4, and a false positive rate of 1% corresponds to | |

* about M/N == 20. | |

* | |

* What this means in practice is that for a few hundred keys using a | |

* KeySize of 12 gives false positive rates on the order of 0.25-4%. | |

* | |

* Similarly, using a KeySize of 10 would lead to a 4% false | |

* positive rate for N == 100 and to quite bad false positive | |

* rates for larger N. | |

*/ | |

public: | |

BloomFilter() | |

{ | |

static_assert(KeySize <= kKeyShift, "KeySize too big"); | |

// Should we have a custom operator new using calloc instead and | |

// require that we're allocated via the operator? | |

clear(); | |

} | |

/* | |

* Clear the filter. This should be done before reusing it, because | |

* just removing all items doesn't clear counters that hit the upper | |

* bound. | |

*/ | |

void clear(); | |

/* | |

* Add an item to the filter. | |

*/ | |

void add(const T* aValue); | |

/* | |

* Remove an item from the filter. | |

*/ | |

void remove(const T* aValue); | |

/* | |

* Check whether the filter might contain an item. This can | |

* sometimes return true even if the item is not in the filter, | |

* but will never return false for items that are actually in the | |

* filter. | |

*/ | |

bool mightContain(const T* aValue) const; | |

/* | |

* Methods for add/remove/contain when we already have a hash computed | |

*/ | |

void add(uint32_t aHash); | |

void remove(uint32_t aHash); | |

bool mightContain(uint32_t aHash) const; | |

private: | |

static const size_t kArraySize = (1 << KeySize); | |

static const uint32_t kKeyMask = (1 << KeySize) - 1; | |

static const uint32_t kKeyShift = 16; | |

static uint32_t hash1(uint32_t aHash) | |

{ | |

return aHash & kKeyMask; | |

} | |

static uint32_t hash2(uint32_t aHash) | |

{ | |

return (aHash >> kKeyShift) & kKeyMask; | |

} | |

uint8_t& firstSlot(uint32_t aHash) | |

{ | |

return mCounters[hash1(aHash)]; | |

} | |

uint8_t& secondSlot(uint32_t aHash) | |

{ | |

return mCounters[hash2(aHash)]; | |

} | |

const uint8_t& firstSlot(uint32_t aHash) const | |

{ | |

return mCounters[hash1(aHash)]; | |

} | |

const uint8_t& secondSlot(uint32_t aHash) const | |

{ | |

return mCounters[hash2(aHash)]; | |

} | |

static bool full(const uint8_t& aSlot) { return aSlot == UINT8_MAX; } | |

uint8_t mCounters[kArraySize]; | |

}; | |

template<unsigned KeySize, class T> | |

inline void | |

BloomFilter<KeySize, T>::clear() | |

{ | |

memset(mCounters, 0, kArraySize); | |

} | |

template<unsigned KeySize, class T> | |

inline void | |

BloomFilter<KeySize, T>::add(uint32_t aHash) | |

{ | |

uint8_t& slot1 = firstSlot(aHash); | |

if (MOZ_LIKELY(!full(slot1))) { | |

++slot1; | |

} | |

uint8_t& slot2 = secondSlot(aHash); | |

if (MOZ_LIKELY(!full(slot2))) { | |

++slot2; | |

} | |

} | |

template<unsigned KeySize, class T> | |

MOZ_ALWAYS_INLINE void | |

BloomFilter<KeySize, T>::add(const T* aValue) | |

{ | |

uint32_t hash = aValue->hash(); | |

return add(hash); | |

} | |

template<unsigned KeySize, class T> | |

inline void | |

BloomFilter<KeySize, T>::remove(uint32_t aHash) | |

{ | |

// If the slots are full, we don't know whether we bumped them to be | |

// there when we added or not, so just leave them full. | |

uint8_t& slot1 = firstSlot(aHash); | |

if (MOZ_LIKELY(!full(slot1))) { | |

--slot1; | |

} | |

uint8_t& slot2 = secondSlot(aHash); | |

if (MOZ_LIKELY(!full(slot2))) { | |

--slot2; | |

} | |

} | |

template<unsigned KeySize, class T> | |

MOZ_ALWAYS_INLINE void | |

BloomFilter<KeySize, T>::remove(const T* aValue) | |

{ | |

uint32_t hash = aValue->hash(); | |

remove(hash); | |

} | |

template<unsigned KeySize, class T> | |

MOZ_ALWAYS_INLINE bool | |

BloomFilter<KeySize, T>::mightContain(uint32_t aHash) const | |

{ | |

// Check that all the slots for this hash contain something | |

return firstSlot(aHash) && secondSlot(aHash); | |

} | |

template<unsigned KeySize, class T> | |

MOZ_ALWAYS_INLINE bool | |

BloomFilter<KeySize, T>::mightContain(const T* aValue) const | |

{ | |

uint32_t hash = aValue->hash(); | |

return mightContain(hash); | |

} | |

} // namespace mozilla | |

#endif /* mozilla_BloomFilter_h */ |