| /* |
| ********************************************************************** |
| * Copyright (C) 2014, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * file name: bitset.cpp |
| * encoding: US-ASCII |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 2007jan15 |
| * created by: Markus Scherer |
| * |
| * Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet |
| * using a folded bit set consisting of a 1k-entry index table and a |
| * compacted array of 64-bit words. |
| * Uses a simple hash table for compaction. |
| * Uses the original set for supplementary code points. |
| */ |
| |
| #include "unicode/utypes.h" |
| #include "unicont.h" |
| #include "cmemory.h" // for UPRV_LENGTHOF |
| |
| /* |
| * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point. |
| * Hashes 64-bit words and maps them to 16-bit integers which are |
| * assigned in order of new incoming words for subsequent storage |
| * in a contiguous array. |
| */ |
| struct BMPBitHash : public UObject { |
| int64_t keys[0x800]; // 2k |
| uint16_t values[0x800]; |
| uint16_t reverse[0x400]; |
| uint16_t count; |
| const int32_t prime=1301; // Less than 2k. |
| |
| BMPBitHash() : count(0) { |
| // Fill values[] with 0xffff. |
| uprv_memset(values, 0xff, sizeof(values)); |
| } |
| |
| /* |
| * Map a key to an integer count. |
| * Map at most 1k=0x400 different keys with this data structure. |
| */ |
| uint16_t map(int64_t key) { |
| int32_t hash=(int32_t)(key>>55)&0x1ff; |
| hash^=(int32_t)(key>>44)&0x7ff; |
| hash^=(int32_t)(key>>33)&0x7ff; |
| hash^=(int32_t)(key>>22)&0x7ff; |
| hash^=(int32_t)(key>>11)&0x7ff; |
| hash^=(int32_t)key&0x7ff; |
| for(;;) { |
| if(values[hash]==0xffff) { |
| // Unused slot. |
| keys[hash]=key; |
| reverse[count]=hash; |
| return values[hash]=count++; |
| } else if(keys[hash]==key) { |
| // Found a slot with this key. |
| return values[hash]; |
| } else { |
| // Used slot with a different key, move to another slot. |
| hash=(hash+prime)&0x7ff; |
| } |
| } |
| } |
| |
| uint16_t countKeys() const { return count; } |
| |
| /* |
| * Invert the hash map: Fill an array of length countKeys() with the keys |
| * indexed by their mapped values. |
| */ |
| void invert(int64_t *k) const { |
| uint16_t i; |
| |
| for(i=0; i<count; ++i) { |
| k[i]=keys[reverse[i]]; |
| } |
| } |
| }; |
| |
| class BitSet : public UObject, public UnicodeContainable { |
| public: |
| BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) { |
| if(U_FAILURE(errorCode)) { |
| return; |
| } |
| BMPBitHash *bitHash=new BMPBitHash; |
| if(bitHash==NULL || restSet==NULL) { |
| errorCode=U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| UnicodeSetIterator iter(set); |
| int64_t b; |
| UChar32 start, end; |
| int32_t prevIndex, i, j; |
| |
| b=0; // Not necessary but makes compilers happy. |
| prevIndex=-1; |
| for(;;) { |
| if(iter.nextRange() && !iter.isString()) { |
| start=iter.getCodepoint(); |
| end=iter.getCodepointEnd(); |
| } else { |
| start=0x10000; |
| } |
| i=start>>6; |
| if(prevIndex!=i) { |
| // Finish the end of the previous range. |
| if(prevIndex<0) { |
| prevIndex=0; |
| } else { |
| index[prevIndex++]=bitHash->map(b); |
| } |
| // Fill all-zero entries between ranges. |
| if(prevIndex<i) { |
| uint16_t zero=bitHash->map(0); |
| do { |
| index[prevIndex++]=zero; |
| } while(prevIndex<i); |
| } |
| b=0; |
| } |
| if(start>0xffff) { |
| break; |
| } |
| b|=~((INT64_C(1)<<(start&0x3f))-1); |
| j=end>>6; |
| if(i<j) { |
| // Set bits for the start of the range. |
| index[i++]=bitHash->map(b); |
| // Fill all-one entries inside the range. |
| if(i<j) { |
| uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff)); |
| do { |
| index[i++]=all; |
| } while(i<j); |
| } |
| b=INT64_C(0xffffffffffffffff); |
| } |
| /* i==j */ |
| b&=(INT64_C(1)<<(end&0x3f))-1; |
| prevIndex=j; |
| } |
| |
| if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) { |
| bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8); |
| } |
| if(bits!=NULL) { |
| bitHash->invert(bits); |
| } else { |
| bits=shortBits; |
| errorCode=U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| latin1Set[0]=(uint32_t)bits[0]; |
| latin1Set[1]=(uint32_t)(bits[0]>>32); |
| latin1Set[2]=(uint32_t)bits[1]; |
| latin1Set[3]=(uint32_t)(bits[1]>>32); |
| latin1Set[4]=(uint32_t)bits[2]; |
| latin1Set[5]=(uint32_t)(bits[2]>>32); |
| latin1Set[6]=(uint32_t)bits[3]; |
| latin1Set[7]=(uint32_t)(bits[3]>>32); |
| |
| restSet.remove(0, 0xffff); |
| } |
| |
| ~BitSet() { |
| if(bits!=shortBits) { |
| uprv_free(bits); |
| } |
| delete restSet; |
| } |
| |
| UBool contains(UChar32 c) const { |
| if((uint32_t)c<=0xff) { |
| return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0); |
| } else if((uint32_t)c<0xffff) { |
| return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0); |
| } else { |
| return restSet->contains(c); |
| } |
| } |
| |
| private: |
| uint16_t index[0x400]; |
| int64_t shortBits[32]; |
| int64_t *bits; |
| |
| uint32_t latin1Bits[8]; |
| |
| UnicodeSet *restSet; |
| }; |