| /* |
| ******************************************************************************* |
| * Copyright (C) 2009-2012, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| */ |
| |
| /** |
| * \file |
| * \brief C API: AlphabeticIndex class |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION |
| |
| #include "unicode/alphaindex.h" |
| #include "unicode/coll.h" |
| #include "unicode/normalizer2.h" |
| #include "unicode/strenum.h" |
| #include "unicode/tblcoll.h" |
| #include "unicode/ulocdata.h" |
| #include "unicode/uniset.h" |
| #include "unicode/uobject.h" |
| #include "unicode/uscript.h" |
| #include "unicode/usetiter.h" |
| #include "unicode/ustring.h" |
| #include "unicode/utf16.h" |
| |
| #include "cstring.h" |
| #include "mutex.h" |
| #include "uassert.h" |
| #include "ucln_in.h" |
| #include "uhash.h" |
| #include "uvector.h" |
| |
| //#include <string> |
| //#include <iostream> |
| U_NAMESPACE_BEGIN |
| |
| UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(AlphabeticIndex) |
| |
| // Forward Declarations |
| static int32_t U_CALLCONV |
| PreferenceComparator(const void *context, const void *left, const void *right); |
| |
| static int32_t U_CALLCONV |
| sortCollateComparator(const void *context, const void *left, const void *right); |
| |
| static int32_t U_CALLCONV |
| recordCompareFn(const void *context, const void *left, const void *right); |
| |
| // UVector<Bucket *> support function, delete a Bucket. |
| static void U_CALLCONV |
| alphaIndex_deleteBucket(void *obj) { |
| delete static_cast<AlphabeticIndex::Bucket *>(obj); |
| } |
| |
| // UVector<Record *> support function, delete a Record. |
| static void U_CALLCONV |
| alphaIndex_deleteRecord(void *obj) { |
| delete static_cast<AlphabeticIndex::Record *>(obj); |
| } |
| |
| |
| |
| static const Normalizer2 *nfkdNormalizer; |
| |
| // |
| // Append the contents of a UnicodeSet to a UVector of UnicodeStrings. |
| // Append everything - individual characters are handled as strings of length 1. |
| // The destination vector owns the appended strings. |
| |
| static void appendUnicodeSetToUVector(UVector &dest, const UnicodeSet &source, UErrorCode &status) { |
| UnicodeSetIterator setIter(source); |
| while (setIter.next()) { |
| const UnicodeString &str = setIter.getString(); |
| dest.addElement(str.clone(), status); |
| } |
| } |
| |
| |
| AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) { |
| init(status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| locale_ = locale; |
| langType_ = langTypeFromLocale(locale_); |
| |
| collator_ = Collator::createInstance(locale, status); |
| if (collator_ != NULL) { |
| collatorPrimaryOnly_ = collator_->clone(); |
| } |
| if (collatorPrimaryOnly_ != NULL) { |
| collatorPrimaryOnly_->setStrength(Collator::PRIMARY); |
| } |
| getIndexExemplars(*initialLabels_, locale, status); |
| indexBuildRequired_ = TRUE; |
| if ((collator_ == NULL || collatorPrimaryOnly_ == NULL) && U_SUCCESS(status)) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| firstScriptCharacters_ = firstStringsInScript(status); |
| } |
| |
| |
| AlphabeticIndex::~AlphabeticIndex() { |
| uhash_close(alreadyIn_); |
| delete bucketList_; |
| delete collator_; |
| delete collatorPrimaryOnly_; |
| delete firstScriptCharacters_; |
| delete labels_; |
| delete inputRecords_; |
| delete noDistinctSorting_; |
| delete notAlphabetic_; |
| delete initialLabels_; |
| } |
| |
| |
| AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| initialLabels_->addAll(additions); |
| return *this; |
| } |
| |
| |
| AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| UnicodeSet additions; |
| getIndexExemplars(additions, locale, status); |
| initialLabels_->addAll(additions); |
| return *this; |
| } |
| |
| |
| int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) { |
| buildIndex(status); |
| if (U_FAILURE(status)) { |
| return 0; |
| } |
| return bucketList_->size(); |
| } |
| |
| |
| int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return 0; |
| } |
| return inputRecords_->size(); |
| } |
| |
| |
| void AlphabeticIndex::buildIndex(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| if (!indexBuildRequired_) { |
| return; |
| } |
| |
| // Discard any already-built data. |
| // This is important when the user builds and uses an index, then subsequently modifies it, |
| // necessitating a rebuild. |
| |
| bucketList_->removeAllElements(); |
| labels_->removeAllElements(); |
| uhash_removeAll(alreadyIn_); |
| noDistinctSorting_->clear(); |
| notAlphabetic_->clear(); |
| |
| // first sort the incoming Labels, with a "best" ordering among items |
| // that are the same according to the collator |
| |
| UVector preferenceSorting(status); // Vector of UnicodeStrings; owned by the vector. |
| preferenceSorting.setDeleter(uprv_deleteUObject); |
| appendUnicodeSetToUVector(preferenceSorting, *initialLabels_, status); |
| preferenceSorting.sortWithUComparator(PreferenceComparator, &status, status); |
| |
| // We now make a set of Labels. |
| // Some of the input may, however, be redundant. |
| // That is, we might have c, ch, d, where "ch" sorts just like "c", "h" |
| // So we make a pass through, filtering out those cases. |
| // TODO: filtering these out would seem to be at odds with the eventual goal |
| // of being able to split buckets that contain too many items. |
| |
| UnicodeSet labelSet; |
| for (int32_t psIndex=0; psIndex<preferenceSorting.size(); psIndex++) { |
| UnicodeString item = *static_cast<const UnicodeString *>(preferenceSorting.elementAt(psIndex)); |
| // TODO: Since preferenceSorting was originally populated from the contents of a UnicodeSet, |
| // is it even possible for duplicates to show up in this check? |
| if (labelSet.contains(item)) { |
| UnicodeSetIterator itemAlreadyInIter(labelSet); |
| while (itemAlreadyInIter.next()) { |
| const UnicodeString &itemAlreadyIn = itemAlreadyInIter.getString(); |
| if (collatorPrimaryOnly_->compare(item, itemAlreadyIn) == 0) { |
| UnicodeSet *targets = static_cast<UnicodeSet *>(uhash_get(alreadyIn_, &itemAlreadyIn)); |
| if (targets == NULL) { |
| // alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>()); |
| targets = new UnicodeSet(); |
| uhash_put(alreadyIn_, itemAlreadyIn.clone(), targets, &status); |
| } |
| targets->add(item); |
| break; |
| } |
| } |
| } else if (item.moveIndex32(0, 1) < item.length() && // Label contains more than one code point. |
| collatorPrimaryOnly_->compare(item, separated(item)) == 0) { |
| noDistinctSorting_->add(item); |
| } else if (!ALPHABETIC->containsSome(item)) { |
| notAlphabetic_->add(item); |
| } else { |
| labelSet.add(item); |
| } |
| } |
| |
| // If we have no labels, hard-code a fallback default set of [A-Z] |
| // This case can occur with locales that don't have exemplar character data, including root. |
| // A no-labels situation will cause other problems; it needs to be avoided. |
| if (labelSet.isEmpty()) { |
| labelSet.add((UChar32)0x41, (UChar32)0x5A); |
| } |
| |
| // Move the set of Labels from the set into a vector, and sort |
| // according to the collator. |
| |
| appendUnicodeSetToUVector(*labels_, labelSet, status); |
| labels_->sortWithUComparator(sortCollateComparator, collatorPrimaryOnly_, status); |
| |
| // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element |
| // Implemented by copying the elements to be retained to a new UVector. |
| |
| const int32_t size = labelSet.size() - 1; |
| if (size > maxLabelCount_) { |
| UVector *newLabels = new UVector(status); |
| newLabels->setDeleter(uprv_deleteUObject); |
| int32_t count = 0; |
| int32_t old = -1; |
| for (int32_t srcIndex=0; srcIndex<labels_->size(); srcIndex++) { |
| const UnicodeString *str = static_cast<const UnicodeString *>(labels_->elementAt(srcIndex)); |
| ++count; |
| const int32_t bump = count * maxLabelCount_ / size; |
| if (bump == old) { |
| // it.remove(); |
| } else { |
| newLabels->addElement(str->clone(), status); |
| old = bump; |
| } |
| } |
| delete labels_; |
| labels_ = newLabels; |
| } |
| |
| // We now know the list of labels. |
| // Create a corresponding list of buckets, one per label. |
| |
| buildBucketList(status); // Corresponds to Java BucketList constructor. |
| |
| // Bin the Records into the Buckets. |
| bucketRecords(status); |
| |
| indexBuildRequired_ = FALSE; |
| resetBucketIterator(status); |
| } |
| |
| // |
| // buildBucketList() Corresponds to the BucketList constructor in the Java version. |
| |
| void AlphabeticIndex::buildBucketList(UErrorCode &status) { |
| UnicodeString labelStr = getUnderflowLabel(); |
| Bucket *b = new Bucket(labelStr, *EMPTY_STRING, U_ALPHAINDEX_UNDERFLOW, status); |
| bucketList_->addElement(b, status); |
| |
| // Build up the list, adding underflow, additions, overflow |
| // insert infix labels as needed, using \uFFFF. |
| const UnicodeString *last = static_cast<UnicodeString *>(labels_->elementAt(0)); |
| b = new Bucket(*last, *last, U_ALPHAINDEX_NORMAL, status); |
| bucketList_->addElement(b, status); |
| |
| UnicodeSet lastSet; |
| UnicodeSet set; |
| AlphabeticIndex::getScriptSet(lastSet, *last, status); |
| lastSet.removeAll(*IGNORE_SCRIPTS); |
| |
| for (int i = 1; i < labels_->size(); ++i) { |
| UnicodeString *current = static_cast<UnicodeString *>(labels_->elementAt(i)); |
| getScriptSet(set, *current, status); |
| set.removeAll(*IGNORE_SCRIPTS); |
| if (lastSet.containsNone(set)) { |
| // check for adjacent |
| const UnicodeString &overflowComparisonString = getOverflowComparisonString(*last, status); |
| if (collatorPrimaryOnly_->compare(overflowComparisonString, *current) < 0) { |
| labelStr = getInflowLabel(); |
| b = new Bucket(labelStr, overflowComparisonString, U_ALPHAINDEX_INFLOW, status); |
| bucketList_->addElement(b, status); |
| i++; |
| lastSet = set; |
| } |
| } |
| b = new Bucket(*current, *current, U_ALPHAINDEX_NORMAL, status); |
| bucketList_->addElement(b, status); |
| last = current; |
| lastSet = set; |
| } |
| const UnicodeString &limitString = getOverflowComparisonString(*last, status); |
| b = new Bucket(getOverflowLabel(), limitString, U_ALPHAINDEX_OVERFLOW, status); |
| bucketList_->addElement(b, status); |
| // final overflow bucket |
| } |
| |
| |
| // |
| // Place all of the raw input records into the correct bucket. |
| // |
| // Begin by sorting the input records; this lets us bin them in a single pass. |
| // |
| // Note on storage management: The input records are owned by the |
| // inputRecords_ vector, and will (eventually) be auto-deleted by it. |
| // The Bucket objects have pointers to the Record objects, but do not own them. |
| // |
| void AlphabeticIndex::bucketRecords(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| inputRecords_->sortWithUComparator(recordCompareFn, collator_, status); |
| U_ASSERT(bucketList_->size() > 0); // Should always have at least an overflow |
| // bucket, even if no user labels. |
| int32_t bucketIndex = 0; |
| Bucket *destBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex)); |
| Bucket *nextBucket = NULL; |
| if (bucketIndex+1 < bucketList_->size()) { |
| nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1)); |
| } |
| int32_t recordIndex = 0; |
| Record *r = static_cast<Record *>(inputRecords_->elementAt(recordIndex)); |
| while (recordIndex < inputRecords_->size()) { |
| if (nextBucket == NULL || |
| collatorPrimaryOnly_->compare(r->sortingName_, nextBucket->lowerBoundary_) < 0) { |
| // Record goes in current bucket. Advance to next record, |
| // stay on current bucket. |
| destBucket->records_->addElement(r, status); |
| ++recordIndex; |
| r = static_cast<Record *>(inputRecords_->elementAt(recordIndex)); |
| } else { |
| // Advance to the next bucket, stay on current record. |
| bucketIndex++; |
| destBucket = nextBucket; |
| if (bucketIndex+1 < bucketList_->size()) { |
| nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1)); |
| } else { |
| nextBucket = NULL; |
| } |
| U_ASSERT(destBucket != NULL); |
| } |
| } |
| |
| } |
| |
| |
| void AlphabeticIndex::getIndexExemplars(UnicodeSet &dest, const Locale &locale, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); |
| UnicodeSet exemplars; |
| ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); |
| if (U_SUCCESS(status)) { |
| dest.addAll(exemplars); |
| return; |
| } |
| status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR |
| |
| // Locale data did not include explicit Index characters. |
| // Synthesize a set of them from the locale's standard exemplar characters. |
| |
| ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| // Upper-case any that aren't already so. |
| // (We only do this for synthesized index characters.) |
| |
| UnicodeSetIterator it(exemplars); |
| UnicodeString upperC; |
| UnicodeSet lowersToRemove; |
| UnicodeSet uppersToAdd; |
| while (it.next()) { |
| const UnicodeString &exemplarC = it.getString(); |
| upperC = exemplarC; |
| upperC.toUpper(locale); |
| if (exemplarC != upperC) { |
| lowersToRemove.add(exemplarC); |
| uppersToAdd.add(upperC); |
| } |
| } |
| exemplars.removeAll(lowersToRemove); |
| exemplars.addAll(uppersToAdd); |
| |
| // get the exemplars, and handle special cases |
| |
| // question: should we add auxiliary exemplars? |
| if (exemplars.containsSome(*CORE_LATIN)) { |
| exemplars.addAll(*CORE_LATIN); |
| } |
| if (exemplars.containsSome(*HANGUL)) { |
| // cut down to small list |
| UnicodeSet BLOCK_HANGUL_SYLLABLES(UNICODE_STRING_SIMPLE("[:block=hangul_syllables:]"), status); |
| exemplars.removeAll(BLOCK_HANGUL_SYLLABLES); |
| exemplars.addAll(*HANGUL); |
| } |
| if (exemplars.containsSome(*ETHIOPIC)) { |
| // cut down to small list |
| // make use of the fact that Ethiopic is allocated in 8's, where |
| // the base is 0 mod 8. |
| UnicodeSetIterator it(*ETHIOPIC); |
| while (it.next() && !it.isString()) { |
| if ((it.getCodepoint() & 0x7) != 0) { |
| exemplars.remove(it.getCodepoint()); |
| } |
| } |
| } |
| dest.addAll(exemplars); |
| } |
| |
| |
| /* |
| * Return the string with interspersed CGJs. Input must have more than 2 codepoints. |
| */ |
| static const UChar32 CGJ = (UChar)0x034F; |
| UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { |
| UnicodeString result; |
| if (item.length() == 0) { |
| return result; |
| } |
| int32_t i = 0; |
| for (;;) { |
| UChar32 cp = item.char32At(i); |
| result.append(cp); |
| i = item.moveIndex32(i, 1); |
| if (i >= item.length()) { |
| break; |
| } |
| result.append(CGJ); |
| } |
| return result; |
| } |
| |
| |
| UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { |
| return FALSE; |
| } |
| |
| |
| UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { |
| return FALSE; |
| } |
| |
| |
| const RuleBasedCollator &AlphabeticIndex::getCollator() const { |
| // There are no known non-RuleBasedCollator collators, and none ever expected. |
| // But, in case that changes, better a null pointer than a wrong type. |
| return *dynamic_cast<RuleBasedCollator *>(collator_); |
| } |
| |
| |
| const UnicodeString &AlphabeticIndex::getInflowLabel() const { |
| return inflowLabel_; |
| } |
| |
| const UnicodeString &AlphabeticIndex::getOverflowLabel() const { |
| return overflowLabel_; |
| } |
| |
| |
| const UnicodeString &AlphabeticIndex::getUnderflowLabel() const { |
| return underflowLabel_; |
| } |
| |
| |
| AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { |
| inflowLabel_ = label; |
| indexBuildRequired_ = TRUE; |
| return *this; |
| } |
| |
| |
| AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { |
| overflowLabel_ = label; |
| indexBuildRequired_ = TRUE; |
| return *this; |
| } |
| |
| |
| AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { |
| underflowLabel_ = label; |
| indexBuildRequired_ = TRUE; |
| return *this; |
| } |
| |
| |
| int32_t AlphabeticIndex::getMaxLabelCount() const { |
| return maxLabelCount_; |
| } |
| |
| |
| AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| if (maxLabelCount <= 0) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return *this; |
| } |
| maxLabelCount_ = maxLabelCount; |
| if (maxLabelCount < bucketList_->size()) { |
| indexBuildRequired_ = TRUE; |
| } |
| return *this; |
| } |
| |
| |
| const UnicodeString &AlphabeticIndex::getOverflowComparisonString(const UnicodeString &lowerLimit, UErrorCode &/*status*/) { |
| for (int32_t i=0; i<firstScriptCharacters_->size(); i++) { |
| const UnicodeString *s = |
| static_cast<const UnicodeString *>(firstScriptCharacters_->elementAt(i)); |
| if (collator_->compare(*s, lowerLimit) > 0) { |
| return *s; |
| } |
| } |
| return *EMPTY_STRING; |
| } |
| |
| UnicodeSet *AlphabeticIndex::getScriptSet(UnicodeSet &dest, const UnicodeString &codePoint, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return &dest; |
| } |
| UChar32 cp = codePoint.char32At(0); |
| UScriptCode scriptCode = uscript_getScript(cp, &status); |
| dest.applyIntPropertyValue(UCHAR_SCRIPT, scriptCode, status); |
| return &dest; |
| } |
| |
| // |
| // init() - Common code for constructors. |
| // |
| |
| void AlphabeticIndex::init(UErrorCode &status) { |
| // Initialize statics if needed. |
| AlphabeticIndex::staticInit(status); |
| |
| // Put the object into a known state so that the destructor will function. |
| |
| alreadyIn_ = NULL; |
| bucketList_ = NULL; |
| collator_ = NULL; |
| collatorPrimaryOnly_ = NULL; |
| currentBucket_ = NULL; |
| firstScriptCharacters_ = NULL; |
| initialLabels_ = NULL; |
| indexBuildRequired_ = TRUE; |
| inputRecords_ = NULL; |
| itemsIterIndex_ = 0; |
| labels_ = NULL; |
| labelsIterIndex_ = 0; |
| maxLabelCount_ = 99; |
| noDistinctSorting_ = NULL; |
| notAlphabetic_ = NULL; |
| recordCounter_ = 0; |
| |
| if (U_FAILURE(status)) { |
| return; |
| } |
| alreadyIn_ = uhash_open(uhash_hashUnicodeString, // Key Hash, |
| uhash_compareUnicodeString, // key Comparator, |
| NULL, // value Comparator |
| &status); |
| uhash_setKeyDeleter(alreadyIn_, uprv_deleteUObject); |
| uhash_setValueDeleter(alreadyIn_, uprv_deleteUObject); |
| |
| bucketList_ = new UVector(status); |
| bucketList_->setDeleter(alphaIndex_deleteBucket); |
| labels_ = new UVector(status); |
| labels_->setDeleter(uprv_deleteUObject); |
| labels_->setComparer(uhash_compareUnicodeString); |
| inputRecords_ = new UVector(status); |
| inputRecords_->setDeleter(alphaIndex_deleteRecord); |
| |
| noDistinctSorting_ = new UnicodeSet(); |
| notAlphabetic_ = new UnicodeSet(); |
| initialLabels_ = new UnicodeSet(); |
| |
| inflowLabel_.remove(); |
| inflowLabel_.append((UChar)0x2026); // Ellipsis |
| overflowLabel_ = inflowLabel_; |
| underflowLabel_ = inflowLabel_; |
| |
| // TODO: check for memory allocation failures. |
| } |
| |
| |
| static UBool indexCharactersAreInitialized = FALSE; |
| |
| // Index Characters Clean up function. Delete statically allocated constant stuff. |
| U_CDECL_BEGIN |
| static UBool U_CALLCONV indexCharacters_cleanup(void) { |
| AlphabeticIndex::staticCleanup(); |
| return TRUE; |
| } |
| U_CDECL_END |
| |
| void AlphabeticIndex::staticCleanup() { |
| delete ALPHABETIC; |
| ALPHABETIC = NULL; |
| delete HANGUL; |
| HANGUL = NULL; |
| delete ETHIOPIC; |
| ETHIOPIC = NULL; |
| delete CORE_LATIN; |
| CORE_LATIN = NULL; |
| delete IGNORE_SCRIPTS; |
| IGNORE_SCRIPTS = NULL; |
| delete TO_TRY; |
| TO_TRY = NULL; |
| delete UNIHAN; |
| UNIHAN = NULL; |
| delete EMPTY_STRING; |
| EMPTY_STRING = NULL; |
| nfkdNormalizer = NULL; // ref to a singleton. Do not delete. |
| indexCharactersAreInitialized = FALSE; |
| } |
| |
| |
| UnicodeSet *AlphabeticIndex::ALPHABETIC; |
| UnicodeSet *AlphabeticIndex::HANGUL; |
| UnicodeSet *AlphabeticIndex::ETHIOPIC; |
| UnicodeSet *AlphabeticIndex::CORE_LATIN; |
| UnicodeSet *AlphabeticIndex::IGNORE_SCRIPTS; |
| UnicodeSet *AlphabeticIndex::TO_TRY; |
| UnicodeSet *AlphabeticIndex::UNIHAN; |
| const UnicodeString *AlphabeticIndex::EMPTY_STRING; |
| |
| // |
| // staticInit() One-time initialization of constants. |
| // Thread safe. Called from constructors. |
| // Mutex overhead is not a concern. AlphabeticIndex constructors are |
| // sufficiently heavy that the cost of the mutex check is not significant. |
| |
| void AlphabeticIndex::staticInit(UErrorCode &status) { |
| static UMutex IndexCharsInitMutex = U_MUTEX_INITIALIZER; |
| |
| Mutex mutex(&IndexCharsInitMutex); |
| if (indexCharactersAreInitialized || U_FAILURE(status)) { |
| return; |
| } |
| UBool finishedInit = FALSE; |
| |
| { |
| UnicodeString alphaString = UNICODE_STRING_SIMPLE("[[:alphabetic:]-[:mark:]]"); |
| ALPHABETIC = new UnicodeSet(alphaString, status); |
| if (ALPHABETIC == NULL) { |
| goto err; |
| } |
| |
| HANGUL = new UnicodeSet(); |
| HANGUL->add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).add(0xB9C8).add(0xBC14).add(0xC0AC). |
| add(0xC544).add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).add(0xD30C).add(0xD558); |
| if (HANGUL== NULL) { |
| goto err; |
| } |
| |
| |
| UnicodeString EthiopicStr = UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"); |
| ETHIOPIC = new UnicodeSet(EthiopicStr, status); |
| if (ETHIOPIC == NULL) { |
| goto err; |
| } |
| |
| CORE_LATIN = new UnicodeSet((UChar32)0x61, (UChar32)0x7a); // ('a', 'z'); |
| if (CORE_LATIN == NULL) { |
| goto err; |
| } |
| |
| UnicodeString IgnoreStr= UNICODE_STRING_SIMPLE( |
| "[[:sc=Common:][:sc=inherited:][:script=Unknown:][:script=braille:]]"); |
| IGNORE_SCRIPTS = new UnicodeSet(IgnoreStr, status); |
| IGNORE_SCRIPTS->freeze(); |
| if (IGNORE_SCRIPTS == NULL) { |
| goto err; |
| } |
| |
| UnicodeString nfcqcStr = UNICODE_STRING_SIMPLE("[:^nfcqc=no:]"); |
| TO_TRY = new UnicodeSet(nfcqcStr, status); |
| if (TO_TRY == NULL) { |
| goto err; |
| } |
| |
| UnicodeString unihanStr = UNICODE_STRING_SIMPLE("[:script=Hani:]"); |
| UNIHAN = new UnicodeSet(unihanStr, status); |
| if (UNIHAN == NULL) { |
| goto err; |
| } |
| |
| EMPTY_STRING = new UnicodeString(); |
| |
| nfkdNormalizer = Normalizer2::getNFKDInstance(status); |
| if (nfkdNormalizer == NULL) { |
| goto err; |
| } |
| } |
| finishedInit = TRUE; |
| |
| err: |
| if (!finishedInit && U_SUCCESS(status)) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| if (U_FAILURE(status)) { |
| indexCharacters_cleanup(); |
| return; |
| } |
| ucln_i18n_registerCleanup(UCLN_I18N_INDEX_CHARACTERS, indexCharacters_cleanup); |
| indexCharactersAreInitialized = TRUE; |
| } |
| |
| |
| // |
| // Comparison function for UVector<UnicodeString *> sorting with a collator. |
| // |
| static int32_t U_CALLCONV |
| sortCollateComparator(const void *context, const void *left, const void *right) { |
| const UElement *leftElement = static_cast<const UElement *>(left); |
| const UElement *rightElement = static_cast<const UElement *>(right); |
| const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer); |
| const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer); |
| const Collator *col = static_cast<const Collator *>(context); |
| |
| if (leftString == rightString) { |
| // Catches case where both are NULL |
| return 0; |
| } |
| if (leftString == NULL) { |
| return 1; |
| }; |
| if (rightString == NULL) { |
| return -1; |
| } |
| Collator::EComparisonResult r = col->compare(*leftString, *rightString); |
| return (int32_t) r; |
| } |
| |
| // |
| // Comparison function for UVector<Record *> sorting with a collator. |
| // |
| static int32_t U_CALLCONV |
| recordCompareFn(const void *context, const void *left, const void *right) { |
| const UElement *leftElement = static_cast<const UElement *>(left); |
| const UElement *rightElement = static_cast<const UElement *>(right); |
| const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer); |
| const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer); |
| const Collator *col = static_cast<const Collator *>(context); |
| |
| Collator::EComparisonResult r = col->compare(leftRec->sortingName_, rightRec->sortingName_); |
| if (r == Collator::EQUAL) { |
| if (leftRec->serialNumber_ < rightRec->serialNumber_) { |
| r = Collator::LESS; |
| } else if (leftRec->serialNumber_ > rightRec->serialNumber_) { |
| r = Collator::GREATER; |
| } |
| } |
| return (int32_t) r; |
| } |
| |
| |
| #if 0 |
| // |
| // First characters in scripts. |
| // Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script. |
| // The vector is sorted according to this index's collation. |
| // |
| // This code is too slow to use, so for now hard code the data. |
| // Hard coded implementation is follows. |
| // |
| UVector *AlphabeticIndex::firstStringsInScript(Collator *ruleBasedCollator, UErrorCode &status) { |
| |
| if (U_FAILURE(status)) { |
| return NULL; |
| } |
| |
| UnicodeString results[USCRIPT_CODE_LIMIT]; |
| UnicodeString LOWER_A = UNICODE_STRING_SIMPLE("a"); |
| |
| UnicodeSetIterator siter(*TO_TRY); |
| while (siter.next()) { |
| const UnicodeString ¤t = siter.getString(); |
| Collator::EComparisonResult r = ruleBasedCollator->compare(current, LOWER_A); |
| if (r < 0) { // TODO fix; we only want "real" script characters, not |
| // symbols. |
| continue; |
| } |
| |
| int script = uscript_getScript(current.char32At(0), &status); |
| if (results[script].length() == 0) { |
| results[script] = current; |
| } |
| else if (ruleBasedCollator->compare(current, results[script]) < 0) { |
| results[script] = current; |
| } |
| } |
| |
| UnicodeSet extras; |
| UnicodeSet expansions; |
| RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(ruleBasedCollator); |
| const UCollator *uRuleBasedCollator = rbc->getUCollator(); |
| ucol_getContractionsAndExpansions(uRuleBasedCollator, extras.toUSet(), expansions.toUSet(), true, &status); |
| extras.addAll(expansions).removeAll(*TO_TRY); |
| if (extras.size() != 0) { |
| const Normalizer2 *normalizer = Normalizer2::getNFKCInstance(status); |
| UnicodeSetIterator extrasIter(extras); |
| while (extrasIter.next()) { |
| const UnicodeString ¤t = extrasIter.next(); |
| if (!TO_TRY->containsAll(current)) |
| continue; |
| if (!normalizer->isNormalized(current, status) || |
| ruleBasedCollator->compare(current, LOWER_A) < 0) { |
| continue; |
| } |
| int script = uscript_getScript(current.char32At(0), &status); |
| if (results[script].length() == 0) { |
| results[script] = current; |
| } else if (ruleBasedCollator->compare(current, results[script]) < 0) { |
| results[script] = current; |
| } |
| } |
| } |
| |
| UVector *dest = new UVector(status); |
| dest->setDeleter(uprv_deleteUObject); |
| for (uint32_t i = 0; i < sizeof(results) / sizeof(results[0]); ++i) { |
| if (results[i].length() > 0) { |
| dest->addElement(results[i].clone(), status); |
| } |
| } |
| dest->sortWithUComparator(sortCollateComparator, ruleBasedCollator, status); |
| return dest; |
| } |
| #endif |
| |
| |
| // |
| // First characters in scripts. |
| // Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script. |
| // The vector is sorted according to this index's collation. |
| // |
| // It takes too much time to compute this from character properties, so hard code it for now. |
| // Character constants copied from corresponding declaration in ICU4J. |
| // See main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java |
| |
| static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = { 0x61, 0, 0x03B1, 0, |
| 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0, |
| 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0, |
| 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0, |
| 0xAAF2, 0, // Meetei Mayek |
| 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0, |
| U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada |
| U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri |
| 0x1B83, 0, |
| 0xD802, 0xDE00, 0, 0x0E01, 0, |
| 0x0EDE, 0, // Lao |
| 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0, |
| 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0, |
| U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma |
| 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0, |
| 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0, |
| 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0, |
| U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao |
| 0xD800, 0xDE80, 0, |
| 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0, |
| 0xD801, 0xDC80, 0, |
| U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng |
| 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0, |
| 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0, |
| U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive |
| U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs |
| 0x4E00, 0 }; |
| |
| UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return NULL; |
| } |
| UVector *dest = new UVector(status); |
| if (dest == NULL) { |
| if (U_SUCCESS(status)) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| return NULL; |
| } |
| dest->setDeleter(uprv_deleteUObject); |
| const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS; |
| const UChar *limit = src + sizeof(HACK_FIRST_CHARS_IN_SCRIPTS) / sizeof(HACK_FIRST_CHARS_IN_SCRIPTS[0]); |
| do { |
| if (U_FAILURE(status)) { |
| return dest; |
| } |
| UnicodeString *str = new UnicodeString(src, -1); |
| if (str == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } else { |
| dest->addElement(str, status); |
| src += str->length() + 1; |
| } |
| } while (src < limit); |
| dest->sortWithUComparator(sortCollateComparator, collator_, status); |
| return dest; |
| } |
| |
| |
| AlphabeticIndex::ELangType AlphabeticIndex::langTypeFromLocale(const Locale &loc) { |
| const char *lang = loc.getLanguage(); |
| if (uprv_strcmp(lang, "zh") != 0) { |
| return kNormal; |
| } |
| const char *script = loc.getScript(); |
| if (uprv_strcmp(script, "Hant") == 0) { |
| return kTraditional; |
| } |
| const char *country = loc.getCountry(); |
| if (uprv_strcmp(country, "TW") == 0) { |
| return kTraditional; |
| } |
| return kSimplified; |
| } |
| |
| |
| // |
| // Pinyin Hacks. Direct port from Java. |
| // |
| |
| static const UChar32 probeCharInLong = 0x28EAD; |
| |
| |
| static const UChar PINYIN_LOWER_BOUNDS_SHORT[] = { // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz" |
| 0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D, |
| /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A}; |
| |
| |
| // Pinyin lookup tables copied, pasted (and reformatted) from the ICU4J code. |
| |
| AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_SHORT = { |
| {(UChar)0, (UChar)0, (UChar)0}, // A |
| {(UChar)0x516B, (UChar)0, (UChar)0}, // B |
| {(UChar)0x5693, (UChar)0, (UChar)0}, // C |
| {(UChar)0x5491, (UChar)0, (UChar)0}, // D |
| {(UChar)0x59B8, (UChar)0, (UChar)0}, // E |
| {(UChar)0x53D1, (UChar)0, (UChar)0}, // F |
| {(UChar)0x65EE, (UChar)0, (UChar)0}, // G |
| {(UChar)0x54C8, (UChar)0, (UChar)0}, // H |
| {(UChar)0x4E0C, (UChar)0, (UChar)0}, // J |
| {(UChar)0x5494, (UChar)0, (UChar)0}, // K |
| {(UChar)0x5783, (UChar)0, (UChar)0}, // L |
| {(UChar)0x5452, (UChar)0, (UChar)0}, // M |
| {(UChar)0x5514, (UChar)0, (UChar)0}, // N |
| {(UChar)0x5594, (UChar)0, (UChar)0}, // O |
| {(UChar)0x5991, (UChar)0, (UChar)0}, // P |
| {(UChar)0x4E03, (UChar)0, (UChar)0}, // Q |
| {(UChar)0x513F, (UChar)0, (UChar)0}, // R |
| {(UChar)0x4EE8, (UChar)0, (UChar)0}, // S |
| {(UChar)0x4ED6, (UChar)0, (UChar)0}, // T |
| {(UChar)0x7A75, (UChar)0, (UChar)0}, // W |
| {(UChar)0x5915, (UChar)0, (UChar)0}, // X |
| {(UChar)0x4E2B, (UChar)0, (UChar)0}, // Y |
| {(UChar)0x5E00, (UChar)0, (UChar)0}, // Z |
| {(UChar)0xFFFF, (UChar)0, (UChar)0}, // mark end of array |
| }; |
| |
| static const UChar PINYIN_LOWER_BOUNDS_LONG[] = { // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz"; |
| 0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D, |
| /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A}; |
| |
| AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_LONG = { |
| {(UChar)0, (UChar)0, (UChar)0}, // A |
| {(UChar)0x516B, (UChar)0, (UChar)0}, // b |
| {(UChar)0xD863, (UChar)0xDEAD, (UChar)0}, // c |
| {(UChar)0xD844, (UChar)0xDE51, (UChar)0}, // d |
| {(UChar)0x59B8, (UChar)0, (UChar)0}, // e |
| {(UChar)0x53D1, (UChar)0, (UChar)0}, // f |
| {(UChar)0xD844, (UChar)0xDE45, (UChar)0}, // g |
| {(UChar)0x54C8, (UChar)0, (UChar)0}, // h |
| {(UChar)0x4E0C, (UChar)0, (UChar)0}, // j |
| {(UChar)0x5494, (UChar)0, (UChar)0}, // k |
| {(UChar)0x3547, (UChar)0, (UChar)0}, // l |
| {(UChar)0x5452, (UChar)0, (UChar)0}, // m |
| {(UChar)0x5514, (UChar)0, (UChar)0}, // n |
| {(UChar)0x5594, (UChar)0, (UChar)0}, // o |
| {(UChar)0xD84F, (UChar)0xDC7A, (UChar)0}, // p |
| {(UChar)0x4E03, (UChar)0, (UChar)0}, // q |
| {(UChar)0x513F, (UChar)0, (UChar)0}, // r |
| {(UChar)0x4EE8, (UChar)0, (UChar)0}, // s |
| {(UChar)0x4ED6, (UChar)0, (UChar)0}, // t |
| {(UChar)0x7A75, (UChar)0, (UChar)0}, // w |
| {(UChar)0x5915, (UChar)0, (UChar)0}, // x |
| {(UChar)0x4E2B, (UChar)0, (UChar)0}, // y |
| {(UChar)0x5E00, (UChar)0, (UChar)0}, // z |
| {(UChar)0xFFFF, (UChar)0, (UChar)0}, // mark end of array |
| }; |
| |
| |
| // |
| // Probe the collation data, and decide which Pinyin tables should be used |
| // |
| // ICU can be built with a choice between two Chinese collations. |
| // The hack Pinyin tables to use depend on which one is in use. |
| // We can assume that any given copy of ICU will have only one of the collations available, |
| // and that there is no way, in a given process, to create two alphabetic indexes using |
| // different Chinese collations. Which means the probe can be done once |
| // and the results cached. |
| // |
| // This whole arrangement is temporary. |
| // |
| AlphabeticIndex::PinyinLookup *AlphabeticIndex::HACK_PINYIN_LOOKUP = NULL; |
| const UChar *AlphabeticIndex::PINYIN_LOWER_BOUNDS = NULL; |
| |
| void AlphabeticIndex::initPinyinBounds(const Collator *col, UErrorCode &status) { |
| { |
| Mutex m; |
| if (PINYIN_LOWER_BOUNDS != NULL) { |
| return; |
| } |
| } |
| UnicodeSet *colSet = col->getTailoredSet(status); |
| if (U_FAILURE(status) || colSet == NULL) { |
| delete colSet; |
| if (U_SUCCESS(status)) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| return; |
| } |
| UBool useLongTables = colSet->contains(probeCharInLong); |
| delete colSet; |
| { |
| Mutex m; |
| if (useLongTables) { |
| PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_LONG; |
| HACK_PINYIN_LOOKUP = &HACK_PINYIN_LOOKUP_LONG; |
| } else { |
| PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_SHORT; |
| HACK_PINYIN_LOOKUP = &HACK_PINYIN_LOOKUP_SHORT; |
| } |
| } |
| } |
| |
| // Pinyin Hack: |
| // Modify a Chinese name by prepending a Latin letter. The modified name is used |
| // when putting records (names) into buckets, to put the name under a Latin index heading. |
| |
| void AlphabeticIndex::hackName(UnicodeString &dest, const UnicodeString &name, const Collator *col) { |
| |
| if (langType_ != kSimplified || !UNIHAN->contains(name.char32At(0))) { |
| dest = name; |
| return; |
| } |
| |
| UErrorCode status = U_ZERO_ERROR; |
| initPinyinBounds(col, status); |
| if (U_FAILURE(status)) { |
| dest = name; |
| return; |
| } |
| // TODO: use binary search |
| int index; |
| for (index=0; ; index++) { |
| if ((*HACK_PINYIN_LOOKUP)[index][0] == (UChar)0xffff) { |
| index--; |
| break; |
| } |
| int32_t compareResult = col->compare(name, UnicodeString(TRUE, (*HACK_PINYIN_LOOKUP)[index], -1)); |
| if (compareResult < 0) { |
| index--; |
| } |
| if (compareResult <= 0) { |
| break; |
| } |
| } |
| UChar c = PINYIN_LOWER_BOUNDS[index]; |
| dest.setTo(c); |
| dest.append(name); |
| return; |
| } |
| |
| |
| |
| /** |
| * Comparator that returns "better" items first, where shorter NFKD is better, and otherwise NFKD binary order is |
| * better, and otherwise binary order is better. |
| * |
| * For use with array sort or UVector. |
| * @param context A UErrorCode pointer. |
| * @param left A UElement pointer, which must refer to a UnicodeString * |
| * @param right A UElement pointer, which must refer to a UnicodeString * |
| */ |
| |
| static int32_t U_CALLCONV |
| PreferenceComparator(const void *context, const void *left, const void *right) { |
| const UElement *leftElement = static_cast<const UElement *>(left); |
| const UElement *rightElement = static_cast<const UElement *>(right); |
| const UnicodeString *s1 = static_cast<const UnicodeString *>(leftElement->pointer); |
| const UnicodeString *s2 = static_cast<const UnicodeString *>(rightElement->pointer); |
| UErrorCode &status = *(UErrorCode *)(context); // Cast off both static and const. |
| if (s1 == s2) { |
| return 0; |
| } |
| |
| UnicodeString n1 = nfkdNormalizer->normalize(*s1, status); |
| UnicodeString n2 = nfkdNormalizer->normalize(*s2, status); |
| int32_t result = n1.length() - n2.length(); |
| if (result != 0) { |
| return result; |
| } |
| |
| result = n1.compareCodePointOrder(n2); |
| if (result != 0) { |
| return result; |
| } |
| return s1->compareCodePointOrder(*s2); |
| } |
| |
| |
| // |
| // Constructor & Destructor for AlphabeticIndex::Record |
| // |
| // Records are internal only, instances are not directly surfaced in the public API. |
| // This class is mostly struct-like, with all public fields. |
| |
| AlphabeticIndex::Record::Record(AlphabeticIndex *alphaIndex, const UnicodeString &name, const void *data): |
| alphaIndex_(alphaIndex), name_(name), data_(data) |
| { |
| UnicodeString prefixedName; |
| alphaIndex->hackName(sortingName_, name_, alphaIndex->collatorPrimaryOnly_); |
| serialNumber_ = ++alphaIndex->recordCounter_; |
| } |
| |
| AlphabeticIndex::Record::~Record() { |
| } |
| |
| |
| AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| Record *r = new Record(this, name, data); |
| inputRecords_->addElement(r, status); |
| indexBuildRequired_ = TRUE; |
| //std::string ss; |
| //std::string ss2; |
| //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" << |
| // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl; |
| return *this; |
| } |
| |
| |
| AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| inputRecords_->removeAllElements(); |
| indexBuildRequired_ = TRUE; |
| return *this; |
| } |
| |
| |
| int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) { |
| buildIndex(status); |
| if (U_FAILURE(status)) { |
| return 0; |
| } |
| |
| // For simplified Chinese prepend a prefix to the name. |
| // For non-Chinese locales or non-Chinese names, the name is not modified. |
| |
| UnicodeString prefixedName; |
| hackName(prefixedName, name, collatorPrimaryOnly_); |
| |
| // TODO: use a binary search. |
| for (int32_t i = 0; i < bucketList_->size(); ++i) { |
| Bucket *bucket = static_cast<Bucket *>(bucketList_->elementAt(i)); |
| Collator::EComparisonResult comp = collatorPrimaryOnly_->compare(prefixedName, bucket->lowerBoundary_); |
| if (comp < 0) { |
| return i - 1; |
| } |
| } |
| // Loop runs until we find the bucket following the one that would hold prefixedName. |
| // If the prefixedName belongs in the last bucket the loop will drop out the bottom rather |
| // than returning from the middle. |
| |
| return bucketList_->size() - 1; |
| } |
| |
| |
| int32_t AlphabeticIndex::getBucketIndex() const { |
| return labelsIterIndex_; |
| } |
| |
| |
| UBool AlphabeticIndex::nextBucket(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (indexBuildRequired_ && currentBucket_ != NULL) { |
| status = U_ENUM_OUT_OF_SYNC_ERROR; |
| return FALSE; |
| } |
| buildIndex(status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| ++labelsIterIndex_; |
| if (labelsIterIndex_ >= bucketList_->size()) { |
| labelsIterIndex_ = bucketList_->size(); |
| return FALSE; |
| } |
| currentBucket_ = static_cast<Bucket *>(bucketList_->elementAt(labelsIterIndex_)); |
| resetRecordIterator(); |
| return TRUE; |
| } |
| |
| const UnicodeString &AlphabeticIndex::getBucketLabel() const { |
| if (currentBucket_ != NULL) { |
| return currentBucket_->label_; |
| } else { |
| return *EMPTY_STRING; |
| } |
| } |
| |
| |
| UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const { |
| if (currentBucket_ != NULL) { |
| return currentBucket_->labelType_; |
| } else { |
| return U_ALPHAINDEX_NORMAL; |
| } |
| } |
| |
| |
| int32_t AlphabeticIndex::getBucketRecordCount() const { |
| if (currentBucket_ != NULL) { |
| return currentBucket_->records_->size(); |
| } else { |
| return 0; |
| } |
| } |
| |
| |
| AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| buildIndex(status); |
| labelsIterIndex_ = -1; |
| currentBucket_ = NULL; |
| return *this; |
| } |
| |
| |
| UBool AlphabeticIndex::nextRecord(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (currentBucket_ == NULL) { |
| // We are trying to iterate over the items in a bucket, but there is no |
| // current bucket from the enumeration of buckets. |
| status = U_INVALID_STATE_ERROR; |
| return FALSE; |
| } |
| if (indexBuildRequired_) { |
| status = U_ENUM_OUT_OF_SYNC_ERROR; |
| return FALSE; |
| } |
| ++itemsIterIndex_; |
| if (itemsIterIndex_ >= currentBucket_->records_->size()) { |
| itemsIterIndex_ = currentBucket_->records_->size(); |
| return FALSE; |
| } |
| return TRUE; |
| } |
| |
| |
| const UnicodeString &AlphabeticIndex::getRecordName() const { |
| const UnicodeString *retStr = EMPTY_STRING; |
| if (currentBucket_ != NULL && |
| itemsIterIndex_ >= 0 && |
| itemsIterIndex_ < currentBucket_->records_->size()) { |
| Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); |
| retStr = &item->name_; |
| } |
| return *retStr; |
| } |
| |
| const void *AlphabeticIndex::getRecordData() const { |
| const void *retPtr = NULL; |
| if (currentBucket_ != NULL && |
| itemsIterIndex_ >= 0 && |
| itemsIterIndex_ < currentBucket_->records_->size()) { |
| Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); |
| retPtr = item->data_; |
| } |
| return retPtr; |
| } |
| |
| |
| AlphabeticIndex & AlphabeticIndex::resetRecordIterator() { |
| itemsIterIndex_ = -1; |
| return *this; |
| } |
| |
| |
| |
| AlphabeticIndex::Bucket::Bucket(const UnicodeString &label, |
| const UnicodeString &lowerBoundary, |
| UAlphabeticIndexLabelType type, |
| UErrorCode &status): |
| label_(label), lowerBoundary_(lowerBoundary), labelType_(type), records_(NULL) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| records_ = new UVector(status); |
| if (records_ == NULL && U_SUCCESS(status)) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| } |
| |
| |
| AlphabeticIndex::Bucket::~Bucket() { |
| delete records_; |
| } |
| |
| U_NAMESPACE_END |
| |
| #endif |