src/third_party/icu/source/test/perf/unisetperf/draft/bitset.cpp - cobalt - Git at Google

 /*
 **********************************************************************
 *   Copyright (C) 2014, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   file name:  bitset.cpp
 *   encoding:   US-ASCII
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 2007jan15
 *   created by: Markus Scherer
 *
 *   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
 *   using a folded bit set consisting of a 1k-entry index table and a
 *   compacted array of 64-bit words.
 *   Uses a simple hash table for compaction.
 *   Uses the original set for supplementary code points.
 */

 #include "unicode/utypes.h"
 #include "unicont.h"
 #include "cmemory.h" // for UPRV_LENGTHOF

 /*
  * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
  * Hashes 64-bit words and maps them to 16-bit integers which are
  * assigned in order of new incoming words for subsequent storage
  * in a contiguous array.
  */
 struct BMPBitHash : public UObject {
     int64_t keys[0x800];  // 2k
     uint16_t values[0x800];
     uint16_t reverse[0x400];
     uint16_t count;
     const int32_t prime=1301;  // Less than 2k.

     BMPBitHash() : count(0) {
         // Fill values[] with 0xffff.
         uprv_memset(values, 0xff, sizeof(values));
     }

     /*
      * Map a key to an integer count.
      * Map at most 1k=0x400 different keys with this data structure.
      */
     uint16_t map(int64_t key) {
         int32_t hash=(int32_t)(key>>55)&0x1ff;
         hash^=(int32_t)(key>>44)&0x7ff;
         hash^=(int32_t)(key>>33)&0x7ff;
         hash^=(int32_t)(key>>22)&0x7ff;
         hash^=(int32_t)(key>>11)&0x7ff;
         hash^=(int32_t)key&0x7ff;
         for(;;) {
             if(values[hash]==0xffff) {
                 // Unused slot.
                 keys[hash]=key;
                 reverse[count]=hash;
                 return values[hash]=count++;
             } else if(keys[hash]==key) {
                 // Found a slot with this key.
                 return values[hash];
             } else {
                 // Used slot with a different key, move to another slot.
                 hash=(hash+prime)&0x7ff;
             }
         }
     }

     uint16_t countKeys() const { return count; }

     /*
      * Invert the hash map: Fill an array of length countKeys() with the keys
      * indexed by their mapped values.
      */
     void invert(int64_t *k) const {
         uint16_t i;

         for(i=0; i<count; ++i) {
             k[i]=keys[reverse[i]];
         }
     }
 };

 class BitSet : public UObject, public UnicodeContainable {
 public:
     BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
         if(U_FAILURE(errorCode)) {
             return;
         }
         BMPBitHash *bitHash=new BMPBitHash;
         if(bitHash==NULL || restSet==NULL) {
             errorCode=U_MEMORY_ALLOCATION_ERROR;
             return;
         }

         UnicodeSetIterator iter(set);
         int64_t b;
         UChar32 start, end;
         int32_t prevIndex, i, j;

         b=0;  // Not necessary but makes compilers happy.
         prevIndex=-1;
         for(;;) {
             if(iter.nextRange() && !iter.isString()) {
                 start=iter.getCodepoint();
                 end=iter.getCodepointEnd();
             } else {
                 start=0x10000;
             }
             i=start>>6;
             if(prevIndex!=i) {
                 // Finish the end of the previous range.
                 if(prevIndex<0) {
                     prevIndex=0;
                 } else {
                     index[prevIndex++]=bitHash->map(b);
                 }
                 // Fill all-zero entries between ranges.
                 if(prevIndex<i) {
                     uint16_t zero=bitHash->map(0);
                     do {
                         index[prevIndex++]=zero;
                     } while(prevIndex<i);
                 }
                 b=0;
             }
             if(start>0xffff) {
                 break;
             }
             b|=~((INT64_C(1)<<(start&0x3f))-1);
             j=end>>6;
             if(i<j) {
                 // Set bits for the start of the range.
                 index[i++]=bitHash->map(b);
                 // Fill all-one entries inside the range.
                 if(i<j) {
                     uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
                     do {
                         index[i++]=all;
                     } while(i<j);
                 }
                 b=INT64_C(0xffffffffffffffff);
             }
             /* i==j */
             b&=(INT64_C(1)<<(end&0x3f))-1;
             prevIndex=j;
         }

         if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) {
             bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
         }
         if(bits!=NULL) {
             bitHash->invert(bits);
         } else {
             bits=shortBits;
             errorCode=U_MEMORY_ALLOCATION_ERROR;
             return;
         }

         latin1Set[0]=(uint32_t)bits[0];
         latin1Set[1]=(uint32_t)(bits[0]>>32);
         latin1Set[2]=(uint32_t)bits[1];
         latin1Set[3]=(uint32_t)(bits[1]>>32);
         latin1Set[4]=(uint32_t)bits[2];
         latin1Set[5]=(uint32_t)(bits[2]>>32);
         latin1Set[6]=(uint32_t)bits[3];
         latin1Set[7]=(uint32_t)(bits[3]>>32);

         restSet.remove(0, 0xffff);
     }

     ~BitSet() {
         if(bits!=shortBits) {
             uprv_free(bits);
         }
         delete restSet;
     }

     UBool contains(UChar32 c) const {
         if((uint32_t)c<=0xff) {
             return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
         } else if((uint32_t)c<0xffff) {
             return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
         } else {
             return restSet->contains(c);
         }
     }

 private:
     uint16_t index[0x400];
     int64_t shortBits[32];
     int64_t *bits;

     uint32_t latin1Bits[8];

     UnicodeSet *restSet;
 };
	/*
	**********************************************************************
	* Copyright (C) 2014, International Business Machines
	* Corporation and others. All Rights Reserved.
	**********************************************************************
	* file name: bitset.cpp
	* encoding: US-ASCII
	* tab size: 8 (not used)
	* indentation:4
	*
	* created on: 2007jan15
	* created by: Markus Scherer
	*
	* Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
	* using a folded bit set consisting of a 1k-entry index table and a
	* compacted array of 64-bit words.
	* Uses a simple hash table for compaction.
	* Uses the original set for supplementary code points.
	*/

	#include "unicode/utypes.h"
	#include "unicont.h"
	#include "cmemory.h" // for UPRV_LENGTHOF

	/*
	* Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
	* Hashes 64-bit words and maps them to 16-bit integers which are
	* assigned in order of new incoming words for subsequent storage
	* in a contiguous array.
	*/
	struct BMPBitHash : public UObject {
	int64_t keys[0x800]; // 2k
	uint16_t values[0x800];
	uint16_t reverse[0x400];
	uint16_t count;
	const int32_t prime=1301; // Less than 2k.

	BMPBitHash() : count(0) {
	// Fill values[] with 0xffff.
	uprv_memset(values, 0xff, sizeof(values));
	}

	/*
	* Map a key to an integer count.
	* Map at most 1k=0x400 different keys with this data structure.
	*/
	uint16_t map(int64_t key) {
	int32_t hash=(int32_t)(key>>55)&0x1ff;
	hash^=(int32_t)(key>>44)&0x7ff;
	hash^=(int32_t)(key>>33)&0x7ff;
	hash^=(int32_t)(key>>22)&0x7ff;
	hash^=(int32_t)(key>>11)&0x7ff;
	hash^=(int32_t)key&0x7ff;
	for(;;) {
	if(values[hash]==0xffff) {
	// Unused slot.
	keys[hash]=key;
	reverse[count]=hash;
	return values[hash]=count++;
	} else if(keys[hash]==key) {
	// Found a slot with this key.
	return values[hash];
	} else {
	// Used slot with a different key, move to another slot.
	hash=(hash+prime)&0x7ff;
	}
	}
	}

	uint16_t countKeys() const { return count; }

	/*
	* Invert the hash map: Fill an array of length countKeys() with the keys
	* indexed by their mapped values.
	*/
	void invert(int64_t *k) const {
	uint16_t i;

	for(i=0; i<count; ++i) {
	k[i]=keys[reverse[i]];
	}
	}
	};

	class BitSet : public UObject, public UnicodeContainable {
	public:
	BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
	if(U_FAILURE(errorCode)) {
	return;
	}
	BMPBitHash *bitHash=new BMPBitHash;
	if(bitHash==NULL \|\| restSet==NULL) {
	errorCode=U_MEMORY_ALLOCATION_ERROR;
	return;
	}

	UnicodeSetIterator iter(set);
	int64_t b;
	UChar32 start, end;
	int32_t prevIndex, i, j;

	b=0; // Not necessary but makes compilers happy.
	prevIndex=-1;
	for(;;) {
	if(iter.nextRange() && !iter.isString()) {
	start=iter.getCodepoint();
	end=iter.getCodepointEnd();
	} else {
	start=0x10000;
	}
	i=start>>6;
	if(prevIndex!=i) {
	// Finish the end of the previous range.
	if(prevIndex<0) {
	prevIndex=0;
	} else {
	index[prevIndex++]=bitHash->map(b);
	}
	// Fill all-zero entries between ranges.
	if(prevIndex<i) {
	uint16_t zero=bitHash->map(0);
	do {
	index[prevIndex++]=zero;
	} while(prevIndex<i);
	}
	b=0;
	}
	if(start>0xffff) {
	break;
	}
	b\|=~((INT64_C(1)<<(start&0x3f))-1);
	j=end>>6;
	if(i<j) {
	// Set bits for the start of the range.
	index[i++]=bitHash->map(b);
	// Fill all-one entries inside the range.
	if(i<j) {
	uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
	do {
	index[i++]=all;
	} while(i<j);
	}
	b=INT64_C(0xffffffffffffffff);
	}
	/* i==j */
	b&=(INT64_C(1)<<(end&0x3f))-1;
	prevIndex=j;
	}

	if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) {
	bits=(int64_t )uprv_malloc(bitHash->countKeys()8);
	}
	if(bits!=NULL) {
	bitHash->invert(bits);
	} else {
	bits=shortBits;
	errorCode=U_MEMORY_ALLOCATION_ERROR;
	return;
	}

	latin1Set[0]=(uint32_t)bits[0];
	latin1Set[1]=(uint32_t)(bits[0]>>32);
	latin1Set[2]=(uint32_t)bits[1];
	latin1Set[3]=(uint32_t)(bits[1]>>32);
	latin1Set[4]=(uint32_t)bits[2];
	latin1Set[5]=(uint32_t)(bits[2]>>32);
	latin1Set[6]=(uint32_t)bits[3];
	latin1Set[7]=(uint32_t)(bits[3]>>32);

	restSet.remove(0, 0xffff);
	}

	~BitSet() {
	if(bits!=shortBits) {
	uprv_free(bits);
	}
	delete restSet;
	}

	UBool contains(UChar32 c) const {
	if((uint32_t)c<=0xff) {
	return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
	} else if((uint32_t)c<0xffff) {
	return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
	} else {
	return restSet->contains(c);
	}
	}

	private:
	uint16_t index[0x400];
	int64_t shortBits[32];
	int64_t *bits;

	uint32_t latin1Bits[8];

	UnicodeSet *restSet;
	};