src/third_party/mozjs-45/mfbt/BloomFilter.h - cobalt - Git at Google

 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

 /*
  * A counting Bloom filter implementation.  This allows consumers to
  * do fast probabilistic "is item X in set Y?" testing which will
  * never answer "no" when the correct answer is "yes" (but might
  * incorrectly answer "yes" when the correct answer is "no").
  */

 #ifndef mozilla_BloomFilter_h
 #define mozilla_BloomFilter_h

 #include "mozilla/Assertions.h"
 #include "mozilla/Likely.h"

 #include <stdint.h>
 #include <string.h>

 namespace mozilla {

 /*
  * This class implements a counting Bloom filter as described at
  * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
  * 8-bit counters.  This allows quick probabilistic answers to the
  * question "is object X in set Y?" where the contents of Y might not
  * be time-invariant.  The probabilistic nature of the test means that
  * sometimes the answer will be "yes" when it should be "no".  If the
  * answer is "no", then X is guaranteed not to be in Y.
  *
  * The filter is parametrized on KeySize, which is the size of the key
  * generated by each of hash functions used by the filter, in bits,
  * and the type of object T being added and removed.  T must implement
  * a |uint32_t hash() const| method which returns a uint32_t hash key
  * that will be used to generate the two separate hash functions for
  * the Bloom filter.  This hash key MUST be well-distributed for good
  * results!  KeySize is not allowed to be larger than 16.
  *
  * The filter uses exactly 2**KeySize bytes of memory.  From now on we
  * will refer to the memory used by the filter as M.
  *
  * The expected rate of incorrect "yes" answers depends on M and on
  * the number N of objects in set Y.  As long as N is small compared
  * to M, the rate of such answers is expected to be approximately
  * 4*(N/M)**2 for this filter.  In practice, if Y has a few hundred
  * elements then using a KeySize of 12 gives a reasonably low
  * incorrect answer rate.  A KeySize of 12 has the additional benefit
  * of using exactly one page for the filter in typical hardware
  * configurations.
  */

 template<unsigned KeySize, class T>
 class BloomFilter
 {
   /*
    * A counting Bloom filter with 8-bit counters.  For now we assume
    * that having two hash functions is enough, but we may revisit that
    * decision later.
    *
    * The filter uses an array with 2**KeySize entries.
    *
    * Assuming a well-distributed hash function, a Bloom filter with
    * array size M containing N elements and
    * using k hash function has expected false positive rate exactly
    *
    * $  (1 - (1 - 1/M)^{kN})^k  $
    *
    * because each array slot has a
    *
    * $  (1 - 1/M)^{kN}  $
    *
    * chance of being 0, and the expected false positive rate is the
    * probability that all of the k hash functions will hit a nonzero
    * slot.
    *
    * For reasonable assumptions (M large, kN large, which should both
    * hold if we're worried about false positives) about M and kN this
    * becomes approximately
    *
    * $$  (1 - \exp(-kN/M))^k   $$
    *
    * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
    * or in other words
    *
    * $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$
    *
    * where r is the false positive rate.  This can be used to compute
    * the desired KeySize for a given load N and false positive rate r.
    *
    * If N/M is assumed small, then the false positive rate can
    * further be approximated as 4*N^2/M^2.  So increasing KeySize by
    * 1, which doubles M, reduces the false positive rate by about a
    * factor of 4, and a false positive rate of 1% corresponds to
    * about M/N == 20.
    *
    * What this means in practice is that for a few hundred keys using a
    * KeySize of 12 gives false positive rates on the order of 0.25-4%.
    *
    * Similarly, using a KeySize of 10 would lead to a 4% false
    * positive rate for N == 100 and to quite bad false positive
    * rates for larger N.
    */
 public:
   BloomFilter()
   {
     static_assert(KeySize <= kKeyShift, "KeySize too big");

     // Should we have a custom operator new using calloc instead and
     // require that we're allocated via the operator?
     clear();
   }

   /*
    * Clear the filter.  This should be done before reusing it, because
    * just removing all items doesn't clear counters that hit the upper
    * bound.
    */
   void clear();

   /*
    * Add an item to the filter.
    */
   void add(const T* aValue);

   /*
    * Remove an item from the filter.
    */
   void remove(const T* aValue);

   /*
    * Check whether the filter might contain an item.  This can
    * sometimes return true even if the item is not in the filter,
    * but will never return false for items that are actually in the
    * filter.
    */
   bool mightContain(const T* aValue) const;

   /*
    * Methods for add/remove/contain when we already have a hash computed
    */
   void add(uint32_t aHash);
   void remove(uint32_t aHash);
   bool mightContain(uint32_t aHash) const;

 private:
   static const size_t kArraySize = (1 << KeySize);
   static const uint32_t kKeyMask = (1 << KeySize) - 1;
   static const uint32_t kKeyShift = 16;

   static uint32_t hash1(uint32_t aHash)
   {
     return aHash & kKeyMask;
   }
   static uint32_t hash2(uint32_t aHash)
   {
     return (aHash >> kKeyShift) & kKeyMask;
   }

   uint8_t& firstSlot(uint32_t aHash)
   {
     return mCounters[hash1(aHash)];
   }
   uint8_t& secondSlot(uint32_t aHash)
   {
     return mCounters[hash2(aHash)];
   }

   const uint8_t& firstSlot(uint32_t aHash) const
   {
     return mCounters[hash1(aHash)];
   }
   const uint8_t& secondSlot(uint32_t aHash) const
   {
     return mCounters[hash2(aHash)];
   }

   static bool full(const uint8_t& aSlot) { return aSlot == UINT8_MAX; }

   uint8_t mCounters[kArraySize];
 };

 template<unsigned KeySize, class T>
 inline void
 BloomFilter<KeySize, T>::clear()
 {
   memset(mCounters, 0, kArraySize);
 }

 template<unsigned KeySize, class T>
 inline void
 BloomFilter<KeySize, T>::add(uint32_t aHash)
 {
   uint8_t& slot1 = firstSlot(aHash);
   if (MOZ_LIKELY(!full(slot1))) {
     ++slot1;
   }
   uint8_t& slot2 = secondSlot(aHash);
   if (MOZ_LIKELY(!full(slot2))) {
     ++slot2;
   }
 }

 template<unsigned KeySize, class T>
 MOZ_ALWAYS_INLINE void
 BloomFilter<KeySize, T>::add(const T* aValue)
 {
   uint32_t hash = aValue->hash();
   return add(hash);
 }

 template<unsigned KeySize, class T>
 inline void
 BloomFilter<KeySize, T>::remove(uint32_t aHash)
 {
   // If the slots are full, we don't know whether we bumped them to be
   // there when we added or not, so just leave them full.
   uint8_t& slot1 = firstSlot(aHash);
   if (MOZ_LIKELY(!full(slot1))) {
     --slot1;
   }
   uint8_t& slot2 = secondSlot(aHash);
   if (MOZ_LIKELY(!full(slot2))) {
     --slot2;
   }
 }

 template<unsigned KeySize, class T>
 MOZ_ALWAYS_INLINE void
 BloomFilter<KeySize, T>::remove(const T* aValue)
 {
   uint32_t hash = aValue->hash();
   remove(hash);
 }

 template<unsigned KeySize, class T>
 MOZ_ALWAYS_INLINE bool
 BloomFilter<KeySize, T>::mightContain(uint32_t aHash) const
 {
   // Check that all the slots for this hash contain something
   return firstSlot(aHash) && secondSlot(aHash);
 }

 template<unsigned KeySize, class T>
 MOZ_ALWAYS_INLINE bool
 BloomFilter<KeySize, T>::mightContain(const T* aValue) const
 {
   uint32_t hash = aValue->hash();
   return mightContain(hash);
 }

 } // namespace mozilla

 #endif /* mozilla_BloomFilter_h */
	/* -- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -- */
	/* vim: set ts=8 sts=2 et sw=2 tw=80: */
	/* This Source Code Form is subject to the terms of the Mozilla Public
	* License, v. 2.0. If a copy of the MPL was not distributed with this
	* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

	/*
	* A counting Bloom filter implementation. This allows consumers to
	* do fast probabilistic "is item X in set Y?" testing which will
	* never answer "no" when the correct answer is "yes" (but might
	* incorrectly answer "yes" when the correct answer is "no").
	*/

	#ifndef mozilla_BloomFilter_h
	#define mozilla_BloomFilter_h

	#include "mozilla/Assertions.h"
	#include "mozilla/Likely.h"

	#include <stdint.h>
	#include <string.h>

	namespace mozilla {

	/*
	* This class implements a counting Bloom filter as described at
	* <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
	* 8-bit counters. This allows quick probabilistic answers to the
	* question "is object X in set Y?" where the contents of Y might not
	* be time-invariant. The probabilistic nature of the test means that
	* sometimes the answer will be "yes" when it should be "no". If the
	* answer is "no", then X is guaranteed not to be in Y.
	*
	* The filter is parametrized on KeySize, which is the size of the key
	* generated by each of hash functions used by the filter, in bits,
	* and the type of object T being added and removed. T must implement
	* a \|uint32_t hash() const\| method which returns a uint32_t hash key
	* that will be used to generate the two separate hash functions for
	* the Bloom filter. This hash key MUST be well-distributed for good
	* results! KeySize is not allowed to be larger than 16.
	*
	* The filter uses exactly 2**KeySize bytes of memory. From now on we
	* will refer to the memory used by the filter as M.
	*
	* The expected rate of incorrect "yes" answers depends on M and on
	* the number N of objects in set Y. As long as N is small compared
	* to M, the rate of such answers is expected to be approximately
	* 4(N/M)*2 for this filter. In practice, if Y has a few hundred
	* elements then using a KeySize of 12 gives a reasonably low
	* incorrect answer rate. A KeySize of 12 has the additional benefit
	* of using exactly one page for the filter in typical hardware
	* configurations.
	*/

	template<unsigned KeySize, class T>
	class BloomFilter
	{
	/*
	* A counting Bloom filter with 8-bit counters. For now we assume
	* that having two hash functions is enough, but we may revisit that
	* decision later.
	*
	* The filter uses an array with 2**KeySize entries.
	*
	* Assuming a well-distributed hash function, a Bloom filter with
	* array size M containing N elements and
	* using k hash function has expected false positive rate exactly
	*
	* $ (1 - (1 - 1/M)^{kN})^k $
	*
	* because each array slot has a
	*
	* $ (1 - 1/M)^{kN} $
	*
	* chance of being 0, and the expected false positive rate is the
	* probability that all of the k hash functions will hit a nonzero
	* slot.
	*
	* For reasonable assumptions (M large, kN large, which should both
	* hold if we're worried about false positives) about M and kN this
	* becomes approximately
	*
	* $$ (1 - \exp(-kN/M))^k $$
	*
	* For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
	* or in other words
	*
	* $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$
	*
	* where r is the false positive rate. This can be used to compute
	* the desired KeySize for a given load N and false positive rate r.
	*
	* If N/M is assumed small, then the false positive rate can
	* further be approximated as 4*N^2/M^2. So increasing KeySize by
	* 1, which doubles M, reduces the false positive rate by about a
	* factor of 4, and a false positive rate of 1% corresponds to
	* about M/N == 20.
	*
	* What this means in practice is that for a few hundred keys using a
	* KeySize of 12 gives false positive rates on the order of 0.25-4%.
	*
	* Similarly, using a KeySize of 10 would lead to a 4% false
	* positive rate for N == 100 and to quite bad false positive
	* rates for larger N.
	*/
	public:
	BloomFilter()
	{
	static_assert(KeySize <= kKeyShift, "KeySize too big");

	// Should we have a custom operator new using calloc instead and
	// require that we're allocated via the operator?
	clear();
	}

	/*
	* Clear the filter. This should be done before reusing it, because
	* just removing all items doesn't clear counters that hit the upper
	* bound.
	*/
	void clear();

	/*
	* Add an item to the filter.
	*/
	void add(const T* aValue);

	/*
	* Remove an item from the filter.
	*/
	void remove(const T* aValue);

	/*
	* Check whether the filter might contain an item. This can
	* sometimes return true even if the item is not in the filter,
	* but will never return false for items that are actually in the
	* filter.
	*/
	bool mightContain(const T* aValue) const;

	/*
	* Methods for add/remove/contain when we already have a hash computed
	*/
	void add(uint32_t aHash);
	void remove(uint32_t aHash);
	bool mightContain(uint32_t aHash) const;

	private:
	static const size_t kArraySize = (1 << KeySize);
	static const uint32_t kKeyMask = (1 << KeySize) - 1;
	static const uint32_t kKeyShift = 16;

	static uint32_t hash1(uint32_t aHash)
	{
	return aHash & kKeyMask;
	}
	static uint32_t hash2(uint32_t aHash)
	{
	return (aHash >> kKeyShift) & kKeyMask;
	}

	uint8_t& firstSlot(uint32_t aHash)
	{
	return mCounters[hash1(aHash)];
	}
	uint8_t& secondSlot(uint32_t aHash)
	{
	return mCounters[hash2(aHash)];
	}

	const uint8_t& firstSlot(uint32_t aHash) const
	{
	return mCounters[hash1(aHash)];
	}
	const uint8_t& secondSlot(uint32_t aHash) const
	{
	return mCounters[hash2(aHash)];
	}

	static bool full(const uint8_t& aSlot) { return aSlot == UINT8_MAX; }

	uint8_t mCounters[kArraySize];
	};

	template<unsigned KeySize, class T>
	inline void
	BloomFilter<KeySize, T>::clear()
	{
	memset(mCounters, 0, kArraySize);
	}

	template<unsigned KeySize, class T>
	inline void
	BloomFilter<KeySize, T>::add(uint32_t aHash)
	{
	uint8_t& slot1 = firstSlot(aHash);
	if (MOZ_LIKELY(!full(slot1))) {
	++slot1;
	}
	uint8_t& slot2 = secondSlot(aHash);
	if (MOZ_LIKELY(!full(slot2))) {
	++slot2;
	}
	}

	template<unsigned KeySize, class T>
	MOZ_ALWAYS_INLINE void
	BloomFilter<KeySize, T>::add(const T* aValue)
	{
	uint32_t hash = aValue->hash();
	return add(hash);
	}

	template<unsigned KeySize, class T>
	inline void
	BloomFilter<KeySize, T>::remove(uint32_t aHash)
	{
	// If the slots are full, we don't know whether we bumped them to be
	// there when we added or not, so just leave them full.
	uint8_t& slot1 = firstSlot(aHash);
	if (MOZ_LIKELY(!full(slot1))) {
	--slot1;
	}
	uint8_t& slot2 = secondSlot(aHash);
	if (MOZ_LIKELY(!full(slot2))) {
	--slot2;
	}
	}

	template<unsigned KeySize, class T>
	MOZ_ALWAYS_INLINE void
	BloomFilter<KeySize, T>::remove(const T* aValue)
	{
	uint32_t hash = aValue->hash();
	remove(hash);
	}

	template<unsigned KeySize, class T>
	MOZ_ALWAYS_INLINE bool
	BloomFilter<KeySize, T>::mightContain(uint32_t aHash) const
	{
	// Check that all the slots for this hash contain something
	return firstSlot(aHash) && secondSlot(aHash);
	}

	template<unsigned KeySize, class T>
	MOZ_ALWAYS_INLINE bool
	BloomFilter<KeySize, T>::mightContain(const T* aValue) const
	{
	uint32_t hash = aValue->hash();
	return mightContain(hash);
	}

	} // namespace mozilla

	#endif /* mozilla_BloomFilter_h */