|  | /* | 
|  | ******************************************************************************* | 
|  | * Copyright (C) 2009-2012, International Business Machines Corporation and    * | 
|  | * others. All Rights Reserved.                                                * | 
|  | ******************************************************************************* | 
|  | */ | 
|  |  | 
|  | /** | 
|  | * \file | 
|  | * \brief C API: AlphabeticIndex class | 
|  | */ | 
|  |  | 
|  | #include "unicode/utypes.h" | 
|  |  | 
|  | #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION | 
|  |  | 
|  | #include "unicode/alphaindex.h" | 
|  | #include "unicode/coll.h" | 
|  | #include "unicode/normalizer2.h" | 
|  | #include "unicode/strenum.h" | 
|  | #include "unicode/tblcoll.h" | 
|  | #include "unicode/ulocdata.h" | 
|  | #include "unicode/uniset.h" | 
|  | #include "unicode/uobject.h" | 
|  | #include "unicode/uscript.h" | 
|  | #include "unicode/usetiter.h" | 
|  | #include "unicode/ustring.h" | 
|  | #include "unicode/utf16.h" | 
|  |  | 
|  | #include "cstring.h" | 
|  | #include "mutex.h" | 
|  | #include "uassert.h" | 
|  | #include "ucln_in.h" | 
|  | #include "uhash.h" | 
|  | #include "uvector.h" | 
|  |  | 
|  | //#include <string> | 
|  | //#include <iostream> | 
|  | U_NAMESPACE_BEGIN | 
|  |  | 
|  | UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(AlphabeticIndex) | 
|  |  | 
|  | // Forward Declarations | 
|  | static int32_t U_CALLCONV | 
|  | PreferenceComparator(const void *context, const void *left, const void *right); | 
|  |  | 
|  | static int32_t U_CALLCONV | 
|  | sortCollateComparator(const void *context, const void *left, const void *right); | 
|  |  | 
|  | static int32_t U_CALLCONV | 
|  | recordCompareFn(const void *context, const void *left, const void *right); | 
|  |  | 
|  | //  UVector<Bucket *> support function, delete a Bucket. | 
|  | static void U_CALLCONV | 
|  | alphaIndex_deleteBucket(void *obj) { | 
|  | delete static_cast<AlphabeticIndex::Bucket *>(obj); | 
|  | } | 
|  |  | 
|  | //  UVector<Record *> support function, delete a Record. | 
|  | static void U_CALLCONV | 
|  | alphaIndex_deleteRecord(void *obj) { | 
|  | delete static_cast<AlphabeticIndex::Record *>(obj); | 
|  | } | 
|  |  | 
|  |  | 
|  |  | 
|  | static const Normalizer2 *nfkdNormalizer; | 
|  |  | 
|  | // | 
|  | //  Append the contents of a UnicodeSet to a UVector of UnicodeStrings. | 
|  | //  Append everything - individual characters are handled as strings of length 1. | 
|  | //  The destination vector owns the appended strings. | 
|  |  | 
|  | static void appendUnicodeSetToUVector(UVector &dest, const UnicodeSet &source, UErrorCode &status) { | 
|  | UnicodeSetIterator setIter(source); | 
|  | while (setIter.next()) { | 
|  | const UnicodeString &str = setIter.getString(); | 
|  | dest.addElement(str.clone(), status); | 
|  | } | 
|  | } | 
|  |  | 
|  |  | 
|  | AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) { | 
|  | init(status); | 
|  | if (U_FAILURE(status)) { | 
|  | return; | 
|  | } | 
|  | locale_ = locale; | 
|  | langType_ = langTypeFromLocale(locale_); | 
|  |  | 
|  | collator_ = Collator::createInstance(locale, status); | 
|  | if (collator_ != NULL) { | 
|  | collatorPrimaryOnly_ = collator_->clone(); | 
|  | } | 
|  | if (collatorPrimaryOnly_ != NULL) { | 
|  | collatorPrimaryOnly_->setStrength(Collator::PRIMARY); | 
|  | } | 
|  | getIndexExemplars(*initialLabels_, locale, status); | 
|  | indexBuildRequired_ = TRUE; | 
|  | if ((collator_ == NULL || collatorPrimaryOnly_ == NULL) && U_SUCCESS(status)) { | 
|  | status = U_MEMORY_ALLOCATION_ERROR; | 
|  | } | 
|  | firstScriptCharacters_ = firstStringsInScript(status); | 
|  | } | 
|  |  | 
|  |  | 
|  | AlphabeticIndex::~AlphabeticIndex() { | 
|  | uhash_close(alreadyIn_); | 
|  | delete bucketList_; | 
|  | delete collator_; | 
|  | delete collatorPrimaryOnly_; | 
|  | delete firstScriptCharacters_; | 
|  | delete labels_; | 
|  | delete inputRecords_; | 
|  | delete noDistinctSorting_; | 
|  | delete notAlphabetic_; | 
|  | delete initialLabels_; | 
|  | } | 
|  |  | 
|  |  | 
|  | AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) { | 
|  | if (U_FAILURE(status)) { | 
|  | return *this; | 
|  | } | 
|  | initialLabels_->addAll(additions); | 
|  | return *this; | 
|  | } | 
|  |  | 
|  |  | 
|  | AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { | 
|  | if (U_FAILURE(status)) { | 
|  | return *this; | 
|  | } | 
|  | UnicodeSet additions; | 
|  | getIndexExemplars(additions, locale, status); | 
|  | initialLabels_->addAll(additions); | 
|  | return *this; | 
|  | } | 
|  |  | 
|  |  | 
|  | int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) { | 
|  | buildIndex(status); | 
|  | if (U_FAILURE(status)) { | 
|  | return 0; | 
|  | } | 
|  | return bucketList_->size(); | 
|  | } | 
|  |  | 
|  |  | 
|  | int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) { | 
|  | if (U_FAILURE(status)) { | 
|  | return 0; | 
|  | } | 
|  | return inputRecords_->size(); | 
|  | } | 
|  |  | 
|  |  | 
|  | void AlphabeticIndex::buildIndex(UErrorCode &status) { | 
|  | if (U_FAILURE(status)) { | 
|  | return; | 
|  | } | 
|  | if (!indexBuildRequired_) { | 
|  | return; | 
|  | } | 
|  |  | 
|  | // Discard any already-built data. | 
|  | // This is important when the user builds and uses an index, then subsequently modifies it, | 
|  | // necessitating a rebuild. | 
|  |  | 
|  | bucketList_->removeAllElements(); | 
|  | labels_->removeAllElements(); | 
|  | uhash_removeAll(alreadyIn_); | 
|  | noDistinctSorting_->clear(); | 
|  | notAlphabetic_->clear(); | 
|  |  | 
|  | // first sort the incoming Labels, with a "best" ordering among items | 
|  | // that are the same according to the collator | 
|  |  | 
|  | UVector preferenceSorting(status);   // Vector of UnicodeStrings; owned by the vector. | 
|  | preferenceSorting.setDeleter(uprv_deleteUObject); | 
|  | appendUnicodeSetToUVector(preferenceSorting, *initialLabels_, status); | 
|  | preferenceSorting.sortWithUComparator(PreferenceComparator, &status, status); | 
|  |  | 
|  | // We now make a set of Labels. | 
|  | // Some of the input may, however, be redundant. | 
|  | // That is, we might have c, ch, d, where "ch" sorts just like "c", "h" | 
|  | // So we make a pass through, filtering out those cases. | 
|  | // TODO: filtering these out would seem to be at odds with the eventual goal | 
|  | //       of being able to split buckets that contain too many items. | 
|  |  | 
|  | UnicodeSet labelSet; | 
|  | for (int32_t psIndex=0; psIndex<preferenceSorting.size(); psIndex++) { | 
|  | UnicodeString item = *static_cast<const UnicodeString *>(preferenceSorting.elementAt(psIndex)); | 
|  | // TODO:  Since preferenceSorting was originally populated from the contents of a UnicodeSet, | 
|  | //        is it even possible for duplicates to show up in this check? | 
|  | if (labelSet.contains(item)) { | 
|  | UnicodeSetIterator itemAlreadyInIter(labelSet); | 
|  | while (itemAlreadyInIter.next()) { | 
|  | const UnicodeString &itemAlreadyIn = itemAlreadyInIter.getString(); | 
|  | if (collatorPrimaryOnly_->compare(item, itemAlreadyIn) == 0) { | 
|  | UnicodeSet *targets = static_cast<UnicodeSet *>(uhash_get(alreadyIn_, &itemAlreadyIn)); | 
|  | if (targets == NULL) { | 
|  | // alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>()); | 
|  | targets = new UnicodeSet(); | 
|  | uhash_put(alreadyIn_, itemAlreadyIn.clone(), targets, &status); | 
|  | } | 
|  | targets->add(item); | 
|  | break; | 
|  | } | 
|  | } | 
|  | } else if (item.moveIndex32(0, 1) < item.length() &&  // Label contains more than one code point. | 
|  | collatorPrimaryOnly_->compare(item, separated(item)) == 0) { | 
|  | noDistinctSorting_->add(item); | 
|  | } else if (!ALPHABETIC->containsSome(item)) { | 
|  | notAlphabetic_->add(item); | 
|  | } else { | 
|  | labelSet.add(item); | 
|  | } | 
|  | } | 
|  |  | 
|  | // If we have no labels, hard-code a fallback default set of [A-Z] | 
|  | // This case can occur with locales that don't have exemplar character data, including root. | 
|  | // A no-labels situation will cause other problems; it needs to be avoided. | 
|  | if (labelSet.isEmpty()) { | 
|  | labelSet.add((UChar32)0x41, (UChar32)0x5A); | 
|  | } | 
|  |  | 
|  | // Move the set of Labels from the set into a vector, and sort | 
|  | // according to the collator. | 
|  |  | 
|  | appendUnicodeSetToUVector(*labels_, labelSet, status); | 
|  | labels_->sortWithUComparator(sortCollateComparator, collatorPrimaryOnly_, status); | 
|  |  | 
|  | // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element | 
|  | //    Implemented by copying the elements to be retained to a new UVector. | 
|  |  | 
|  | const int32_t size = labelSet.size() - 1; | 
|  | if (size > maxLabelCount_) { | 
|  | UVector *newLabels = new UVector(status); | 
|  | newLabels->setDeleter(uprv_deleteUObject); | 
|  | int32_t count = 0; | 
|  | int32_t old = -1; | 
|  | for (int32_t srcIndex=0; srcIndex<labels_->size(); srcIndex++) { | 
|  | const UnicodeString *str = static_cast<const UnicodeString *>(labels_->elementAt(srcIndex)); | 
|  | ++count; | 
|  | const int32_t bump = count * maxLabelCount_ / size; | 
|  | if (bump == old) { | 
|  | // it.remove(); | 
|  | } else { | 
|  | newLabels->addElement(str->clone(), status); | 
|  | old = bump; | 
|  | } | 
|  | } | 
|  | delete labels_; | 
|  | labels_ = newLabels; | 
|  | } | 
|  |  | 
|  | // We now know the list of labels. | 
|  | // Create a corresponding list of buckets, one per label. | 
|  |  | 
|  | buildBucketList(status);    // Corresponds to Java BucketList constructor. | 
|  |  | 
|  | // Bin the Records into the Buckets. | 
|  | bucketRecords(status); | 
|  |  | 
|  | indexBuildRequired_ = FALSE; | 
|  | resetBucketIterator(status); | 
|  | } | 
|  |  | 
|  | // | 
|  | //  buildBucketList()    Corresponds to the BucketList constructor in the Java version. | 
|  |  | 
|  | void AlphabeticIndex::buildBucketList(UErrorCode &status) { | 
|  | UnicodeString labelStr = getUnderflowLabel(); | 
|  | Bucket *b = new Bucket(labelStr, *EMPTY_STRING, U_ALPHAINDEX_UNDERFLOW, status); | 
|  | bucketList_->addElement(b, status); | 
|  |  | 
|  | // Build up the list, adding underflow, additions, overflow | 
|  | // insert infix labels as needed, using \uFFFF. | 
|  | const UnicodeString *last = static_cast<UnicodeString *>(labels_->elementAt(0)); | 
|  | b = new Bucket(*last, *last, U_ALPHAINDEX_NORMAL, status); | 
|  | bucketList_->addElement(b, status); | 
|  |  | 
|  | UnicodeSet lastSet; | 
|  | UnicodeSet set; | 
|  | AlphabeticIndex::getScriptSet(lastSet, *last, status); | 
|  | lastSet.removeAll(*IGNORE_SCRIPTS); | 
|  |  | 
|  | for (int i = 1; i < labels_->size(); ++i) { | 
|  | UnicodeString *current = static_cast<UnicodeString *>(labels_->elementAt(i)); | 
|  | getScriptSet(set, *current, status); | 
|  | set.removeAll(*IGNORE_SCRIPTS); | 
|  | if (lastSet.containsNone(set)) { | 
|  | // check for adjacent | 
|  | const UnicodeString &overflowComparisonString = getOverflowComparisonString(*last, status); | 
|  | if (collatorPrimaryOnly_->compare(overflowComparisonString, *current) < 0) { | 
|  | labelStr = getInflowLabel(); | 
|  | b = new Bucket(labelStr, overflowComparisonString, U_ALPHAINDEX_INFLOW, status); | 
|  | bucketList_->addElement(b, status); | 
|  | i++; | 
|  | lastSet = set; | 
|  | } | 
|  | } | 
|  | b = new Bucket(*current, *current, U_ALPHAINDEX_NORMAL, status); | 
|  | bucketList_->addElement(b, status); | 
|  | last = current; | 
|  | lastSet = set; | 
|  | } | 
|  | const UnicodeString &limitString = getOverflowComparisonString(*last, status); | 
|  | b = new Bucket(getOverflowLabel(), limitString, U_ALPHAINDEX_OVERFLOW, status); | 
|  | bucketList_->addElement(b, status); | 
|  | // final overflow bucket | 
|  | } | 
|  |  | 
|  |  | 
|  | // | 
|  | //   Place all of the raw input records into the correct bucket. | 
|  | // | 
|  | //       Begin by sorting the input records; this lets us bin them in a single pass. | 
|  | // | 
|  | //       Note on storage management:  The input records are owned by the | 
|  | //       inputRecords_ vector, and will (eventually) be auto-deleted by it. | 
|  | //       The Bucket objects have pointers to the Record objects, but do not own them. | 
|  | // | 
|  | void AlphabeticIndex::bucketRecords(UErrorCode &status) { | 
|  | if (U_FAILURE(status)) { | 
|  | return; | 
|  | } | 
|  |  | 
|  | inputRecords_->sortWithUComparator(recordCompareFn, collator_, status); | 
|  | U_ASSERT(bucketList_->size() > 0);   // Should always have at least an overflow | 
|  | //   bucket, even if no user labels. | 
|  | int32_t bucketIndex = 0; | 
|  | Bucket *destBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex)); | 
|  | Bucket *nextBucket = NULL; | 
|  | if (bucketIndex+1 < bucketList_->size()) { | 
|  | nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1)); | 
|  | } | 
|  | int32_t recordIndex = 0; | 
|  | Record *r = static_cast<Record *>(inputRecords_->elementAt(recordIndex)); | 
|  | while (recordIndex < inputRecords_->size()) { | 
|  | if (nextBucket == NULL || | 
|  | collatorPrimaryOnly_->compare(r->sortingName_, nextBucket->lowerBoundary_) < 0) { | 
|  | // Record goes in current bucket.  Advance to next record, | 
|  | // stay on current bucket. | 
|  | destBucket->records_->addElement(r, status); | 
|  | ++recordIndex; | 
|  | r = static_cast<Record *>(inputRecords_->elementAt(recordIndex)); | 
|  | } else { | 
|  | // Advance to the next bucket, stay on current record. | 
|  | bucketIndex++; | 
|  | destBucket = nextBucket; | 
|  | if (bucketIndex+1 < bucketList_->size()) { | 
|  | nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1)); | 
|  | } else { | 
|  | nextBucket = NULL; | 
|  | } | 
|  | U_ASSERT(destBucket != NULL); | 
|  | } | 
|  | } | 
|  |  | 
|  | } | 
|  |  | 
|  |  | 
|  | void AlphabeticIndex::getIndexExemplars(UnicodeSet  &dest, const Locale &locale, UErrorCode &status) { | 
|  | if (U_FAILURE(status)) { | 
|  | return; | 
|  | } | 
|  |  | 
|  | LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); | 
|  | UnicodeSet exemplars; | 
|  | ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); | 
|  | if (U_SUCCESS(status)) { | 
|  | dest.addAll(exemplars); | 
|  | return; | 
|  | } | 
|  | status = U_ZERO_ERROR;  // Clear out U_MISSING_RESOURCE_ERROR | 
|  |  | 
|  | // Locale data did not include explicit Index characters. | 
|  | // Synthesize a set of them from the locale's standard exemplar characters. | 
|  |  | 
|  | ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); | 
|  | if (U_FAILURE(status)) { | 
|  | return; | 
|  | } | 
|  |  | 
|  | // Upper-case any that aren't already so. | 
|  | //   (We only do this for synthesized index characters.) | 
|  |  | 
|  | UnicodeSetIterator it(exemplars); | 
|  | UnicodeString upperC; | 
|  | UnicodeSet  lowersToRemove; | 
|  | UnicodeSet  uppersToAdd; | 
|  | while (it.next()) { | 
|  | const UnicodeString &exemplarC = it.getString(); | 
|  | upperC = exemplarC; | 
|  | upperC.toUpper(locale); | 
|  | if (exemplarC != upperC) { | 
|  | lowersToRemove.add(exemplarC); | 
|  | uppersToAdd.add(upperC); | 
|  | } | 
|  | } | 
|  | exemplars.removeAll(lowersToRemove); | 
|  | exemplars.addAll(uppersToAdd); | 
|  |  | 
|  | // get the exemplars, and handle special cases | 
|  |  | 
|  | // question: should we add auxiliary exemplars? | 
|  | if (exemplars.containsSome(*CORE_LATIN)) { | 
|  | exemplars.addAll(*CORE_LATIN); | 
|  | } | 
|  | if (exemplars.containsSome(*HANGUL)) { | 
|  | // cut down to small list | 
|  | UnicodeSet BLOCK_HANGUL_SYLLABLES(UNICODE_STRING_SIMPLE("[:block=hangul_syllables:]"), status); | 
|  | exemplars.removeAll(BLOCK_HANGUL_SYLLABLES); | 
|  | exemplars.addAll(*HANGUL); | 
|  | } | 
|  | if (exemplars.containsSome(*ETHIOPIC)) { | 
|  | // cut down to small list | 
|  | // make use of the fact that Ethiopic is allocated in 8's, where | 
|  | // the base is 0 mod 8. | 
|  | UnicodeSetIterator  it(*ETHIOPIC); | 
|  | while (it.next() && !it.isString()) { | 
|  | if ((it.getCodepoint() & 0x7) != 0) { | 
|  | exemplars.remove(it.getCodepoint()); | 
|  | } | 
|  | } | 
|  | } | 
|  | dest.addAll(exemplars); | 
|  | } | 
|  |  | 
|  |  | 
|  | /* | 
|  | * Return the string with interspersed CGJs. Input must have more than 2 codepoints. | 
|  | */ | 
|  | static const UChar32 CGJ = (UChar)0x034F; | 
|  | UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { | 
|  | UnicodeString result; | 
|  | if (item.length() == 0) { | 
|  | return result; | 
|  | } | 
|  | int32_t i = 0; | 
|  | for (;;) { | 
|  | UChar32  cp = item.char32At(i); | 
|  | result.append(cp); | 
|  | i = item.moveIndex32(i, 1); | 
|  | if (i >= item.length()) { | 
|  | break; | 
|  | } | 
|  | result.append(CGJ); | 
|  | } | 
|  | return result; | 
|  | } | 
|  |  | 
|  |  | 
|  | UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { | 
|  | return FALSE; | 
|  | } | 
|  |  | 
|  |  | 
|  | UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { | 
|  | return FALSE; | 
|  | } | 
|  |  | 
|  |  | 
|  | const RuleBasedCollator &AlphabeticIndex::getCollator() const { | 
|  | // There are no known non-RuleBasedCollator collators, and none ever expected. | 
|  | // But, in case that changes, better a null pointer than a wrong type. | 
|  | return *dynamic_cast<RuleBasedCollator *>(collator_); | 
|  | } | 
|  |  | 
|  |  | 
|  | const UnicodeString &AlphabeticIndex::getInflowLabel() const { | 
|  | return inflowLabel_; | 
|  | } | 
|  |  | 
|  | const UnicodeString &AlphabeticIndex::getOverflowLabel() const { | 
|  | return overflowLabel_; | 
|  | } | 
|  |  | 
|  |  | 
|  | const UnicodeString &AlphabeticIndex::getUnderflowLabel() const { | 
|  | return underflowLabel_; | 
|  | } | 
|  |  | 
|  |  | 
|  | AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { | 
|  | inflowLabel_ = label; | 
|  | indexBuildRequired_ = TRUE; | 
|  | return *this; | 
|  | } | 
|  |  | 
|  |  | 
|  | AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { | 
|  | overflowLabel_ = label; | 
|  | indexBuildRequired_ = TRUE; | 
|  | return *this; | 
|  | } | 
|  |  | 
|  |  | 
|  | AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { | 
|  | underflowLabel_ = label; | 
|  | indexBuildRequired_ = TRUE; | 
|  | return *this; | 
|  | } | 
|  |  | 
|  |  | 
|  | int32_t AlphabeticIndex::getMaxLabelCount() const { | 
|  | return maxLabelCount_; | 
|  | } | 
|  |  | 
|  |  | 
|  | AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) { | 
|  | if (U_FAILURE(status)) { | 
|  | return *this; | 
|  | } | 
|  | if (maxLabelCount <= 0) { | 
|  | status = U_ILLEGAL_ARGUMENT_ERROR; | 
|  | return *this; | 
|  | } | 
|  | maxLabelCount_ = maxLabelCount; | 
|  | if (maxLabelCount < bucketList_->size()) { | 
|  | indexBuildRequired_ = TRUE; | 
|  | } | 
|  | return *this; | 
|  | } | 
|  |  | 
|  |  | 
|  | const UnicodeString &AlphabeticIndex::getOverflowComparisonString(const UnicodeString &lowerLimit, UErrorCode &/*status*/) { | 
|  | for (int32_t i=0; i<firstScriptCharacters_->size(); i++) { | 
|  | const UnicodeString *s = | 
|  | static_cast<const UnicodeString *>(firstScriptCharacters_->elementAt(i)); | 
|  | if (collator_->compare(*s, lowerLimit) > 0) { | 
|  | return *s; | 
|  | } | 
|  | } | 
|  | return *EMPTY_STRING; | 
|  | } | 
|  |  | 
|  | UnicodeSet *AlphabeticIndex::getScriptSet(UnicodeSet &dest, const UnicodeString &codePoint, UErrorCode &status) { | 
|  | if (U_FAILURE(status)) { | 
|  | return &dest; | 
|  | } | 
|  | UChar32 cp = codePoint.char32At(0); | 
|  | UScriptCode scriptCode = uscript_getScript(cp, &status); | 
|  | dest.applyIntPropertyValue(UCHAR_SCRIPT, scriptCode, status); | 
|  | return &dest; | 
|  | } | 
|  |  | 
|  | // | 
|  | //  init() - Common code for constructors. | 
|  | // | 
|  |  | 
|  | void AlphabeticIndex::init(UErrorCode &status) { | 
|  | // Initialize statics if needed. | 
|  | AlphabeticIndex::staticInit(status); | 
|  |  | 
|  | // Put the object into a known state so that the destructor will function. | 
|  |  | 
|  | alreadyIn_             = NULL; | 
|  | bucketList_            = NULL; | 
|  | collator_              = NULL; | 
|  | collatorPrimaryOnly_   = NULL; | 
|  | currentBucket_         = NULL; | 
|  | firstScriptCharacters_ = NULL; | 
|  | initialLabels_         = NULL; | 
|  | indexBuildRequired_    = TRUE; | 
|  | inputRecords_          = NULL; | 
|  | itemsIterIndex_        = 0; | 
|  | labels_                = NULL; | 
|  | labelsIterIndex_       = 0; | 
|  | maxLabelCount_         = 99; | 
|  | noDistinctSorting_     = NULL; | 
|  | notAlphabetic_         = NULL; | 
|  | recordCounter_         = 0; | 
|  |  | 
|  | if (U_FAILURE(status)) { | 
|  | return; | 
|  | } | 
|  | alreadyIn_             = uhash_open(uhash_hashUnicodeString,    // Key Hash, | 
|  | uhash_compareUnicodeString, // key Comparator, | 
|  | NULL,                       // value Comparator | 
|  | &status); | 
|  | uhash_setKeyDeleter(alreadyIn_, uprv_deleteUObject); | 
|  | uhash_setValueDeleter(alreadyIn_, uprv_deleteUObject); | 
|  |  | 
|  | bucketList_            = new UVector(status); | 
|  | bucketList_->setDeleter(alphaIndex_deleteBucket); | 
|  | labels_                = new UVector(status); | 
|  | labels_->setDeleter(uprv_deleteUObject); | 
|  | labels_->setComparer(uhash_compareUnicodeString); | 
|  | inputRecords_          = new UVector(status); | 
|  | inputRecords_->setDeleter(alphaIndex_deleteRecord); | 
|  |  | 
|  | noDistinctSorting_     = new UnicodeSet(); | 
|  | notAlphabetic_         = new UnicodeSet(); | 
|  | initialLabels_         = new UnicodeSet(); | 
|  |  | 
|  | inflowLabel_.remove(); | 
|  | inflowLabel_.append((UChar)0x2026);    // Ellipsis | 
|  | overflowLabel_ = inflowLabel_; | 
|  | underflowLabel_ = inflowLabel_; | 
|  |  | 
|  | // TODO:  check for memory allocation failures. | 
|  | } | 
|  |  | 
|  |  | 
|  | static  UBool  indexCharactersAreInitialized = FALSE; | 
|  |  | 
|  | //  Index Characters Clean up function.  Delete statically allocated constant stuff. | 
|  | U_CDECL_BEGIN | 
|  | static UBool U_CALLCONV indexCharacters_cleanup(void) { | 
|  | AlphabeticIndex::staticCleanup(); | 
|  | return TRUE; | 
|  | } | 
|  | U_CDECL_END | 
|  |  | 
|  | void AlphabeticIndex::staticCleanup() { | 
|  | delete ALPHABETIC; | 
|  | ALPHABETIC = NULL; | 
|  | delete HANGUL; | 
|  | HANGUL = NULL; | 
|  | delete ETHIOPIC; | 
|  | ETHIOPIC = NULL; | 
|  | delete CORE_LATIN; | 
|  | CORE_LATIN = NULL; | 
|  | delete IGNORE_SCRIPTS; | 
|  | IGNORE_SCRIPTS = NULL; | 
|  | delete TO_TRY; | 
|  | TO_TRY = NULL; | 
|  | delete UNIHAN; | 
|  | UNIHAN = NULL; | 
|  | delete EMPTY_STRING; | 
|  | EMPTY_STRING = NULL; | 
|  | nfkdNormalizer = NULL;  // ref to a singleton.  Do not delete. | 
|  | indexCharactersAreInitialized = FALSE; | 
|  | } | 
|  |  | 
|  |  | 
|  | UnicodeSet *AlphabeticIndex::ALPHABETIC; | 
|  | UnicodeSet *AlphabeticIndex::HANGUL; | 
|  | UnicodeSet *AlphabeticIndex::ETHIOPIC; | 
|  | UnicodeSet *AlphabeticIndex::CORE_LATIN; | 
|  | UnicodeSet *AlphabeticIndex::IGNORE_SCRIPTS; | 
|  | UnicodeSet *AlphabeticIndex::TO_TRY; | 
|  | UnicodeSet *AlphabeticIndex::UNIHAN; | 
|  | const UnicodeString *AlphabeticIndex::EMPTY_STRING; | 
|  |  | 
|  | // | 
|  | //  staticInit()    One-time initialization of constants. | 
|  | //                  Thread safe.  Called from constructors. | 
|  | //                  Mutex overhead is not a concern.  AlphabeticIndex constructors are | 
|  | //                  sufficiently heavy that the cost of the mutex check is not significant. | 
|  |  | 
|  | void AlphabeticIndex::staticInit(UErrorCode &status) { | 
|  | static UMutex IndexCharsInitMutex = U_MUTEX_INITIALIZER; | 
|  |  | 
|  | Mutex mutex(&IndexCharsInitMutex); | 
|  | if (indexCharactersAreInitialized || U_FAILURE(status)) { | 
|  | return; | 
|  | } | 
|  | UBool finishedInit = FALSE; | 
|  |  | 
|  | { | 
|  | UnicodeString alphaString = UNICODE_STRING_SIMPLE("[[:alphabetic:]-[:mark:]]"); | 
|  | ALPHABETIC = new UnicodeSet(alphaString, status); | 
|  | if (ALPHABETIC == NULL) { | 
|  | goto err; | 
|  | } | 
|  |  | 
|  | HANGUL = new UnicodeSet(); | 
|  | HANGUL->add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).add(0xB9C8).add(0xBC14).add(0xC0AC). | 
|  | add(0xC544).add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).add(0xD30C).add(0xD558); | 
|  | if (HANGUL== NULL) { | 
|  | goto err; | 
|  | } | 
|  |  | 
|  |  | 
|  | UnicodeString EthiopicStr = UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"); | 
|  | ETHIOPIC = new UnicodeSet(EthiopicStr, status); | 
|  | if (ETHIOPIC == NULL) { | 
|  | goto err; | 
|  | } | 
|  |  | 
|  | CORE_LATIN = new UnicodeSet((UChar32)0x61, (UChar32)0x7a);  // ('a', 'z'); | 
|  | if (CORE_LATIN == NULL) { | 
|  | goto err; | 
|  | } | 
|  |  | 
|  | UnicodeString IgnoreStr= UNICODE_STRING_SIMPLE( | 
|  | "[[:sc=Common:][:sc=inherited:][:script=Unknown:][:script=braille:]]"); | 
|  | IGNORE_SCRIPTS = new UnicodeSet(IgnoreStr, status); | 
|  | IGNORE_SCRIPTS->freeze(); | 
|  | if (IGNORE_SCRIPTS == NULL) { | 
|  | goto err; | 
|  | } | 
|  |  | 
|  | UnicodeString nfcqcStr = UNICODE_STRING_SIMPLE("[:^nfcqc=no:]"); | 
|  | TO_TRY = new UnicodeSet(nfcqcStr, status); | 
|  | if (TO_TRY == NULL) { | 
|  | goto err; | 
|  | } | 
|  |  | 
|  | UnicodeString unihanStr = UNICODE_STRING_SIMPLE("[:script=Hani:]"); | 
|  | UNIHAN = new UnicodeSet(unihanStr, status); | 
|  | if (UNIHAN == NULL) { | 
|  | goto err; | 
|  | } | 
|  |  | 
|  | EMPTY_STRING = new UnicodeString(); | 
|  |  | 
|  | nfkdNormalizer = Normalizer2::getNFKDInstance(status); | 
|  | if (nfkdNormalizer == NULL) { | 
|  | goto err; | 
|  | } | 
|  | } | 
|  | finishedInit = TRUE; | 
|  |  | 
|  | err: | 
|  | if (!finishedInit && U_SUCCESS(status)) { | 
|  | status = U_MEMORY_ALLOCATION_ERROR; | 
|  | } | 
|  | if (U_FAILURE(status)) { | 
|  | indexCharacters_cleanup(); | 
|  | return; | 
|  | } | 
|  | ucln_i18n_registerCleanup(UCLN_I18N_INDEX_CHARACTERS, indexCharacters_cleanup); | 
|  | indexCharactersAreInitialized = TRUE; | 
|  | } | 
|  |  | 
|  |  | 
|  | // | 
|  | //  Comparison function for UVector<UnicodeString *> sorting with a collator. | 
|  | // | 
|  | static int32_t U_CALLCONV | 
|  | sortCollateComparator(const void *context, const void *left, const void *right) { | 
|  | const UElement *leftElement = static_cast<const UElement *>(left); | 
|  | const UElement *rightElement = static_cast<const UElement *>(right); | 
|  | const UnicodeString *leftString  = static_cast<const UnicodeString *>(leftElement->pointer); | 
|  | const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer); | 
|  | const Collator *col = static_cast<const Collator *>(context); | 
|  |  | 
|  | if (leftString == rightString) { | 
|  | // Catches case where both are NULL | 
|  | return 0; | 
|  | } | 
|  | if (leftString == NULL) { | 
|  | return 1; | 
|  | }; | 
|  | if (rightString == NULL) { | 
|  | return -1; | 
|  | } | 
|  | Collator::EComparisonResult r = col->compare(*leftString, *rightString); | 
|  | return (int32_t) r; | 
|  | } | 
|  |  | 
|  | // | 
|  | //  Comparison function for UVector<Record *> sorting with a collator. | 
|  | // | 
|  | static int32_t U_CALLCONV | 
|  | recordCompareFn(const void *context, const void *left, const void *right) { | 
|  | const UElement *leftElement = static_cast<const UElement *>(left); | 
|  | const UElement *rightElement = static_cast<const UElement *>(right); | 
|  | const AlphabeticIndex::Record *leftRec  = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer); | 
|  | const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer); | 
|  | const Collator *col = static_cast<const Collator *>(context); | 
|  |  | 
|  | Collator::EComparisonResult r = col->compare(leftRec->sortingName_, rightRec->sortingName_); | 
|  | if (r == Collator::EQUAL) { | 
|  | if (leftRec->serialNumber_ < rightRec->serialNumber_) { | 
|  | r = Collator::LESS; | 
|  | } else if (leftRec->serialNumber_ > rightRec->serialNumber_) { | 
|  | r = Collator::GREATER; | 
|  | } | 
|  | } | 
|  | return (int32_t) r; | 
|  | } | 
|  |  | 
|  |  | 
|  | #if 0 | 
|  | // | 
|  | //  First characters in scripts. | 
|  | //  Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script. | 
|  | //  The vector is sorted according to this index's collation. | 
|  | // | 
|  | //  This code is too slow to use, so for now hard code the data. | 
|  | //    Hard coded implementation is follows. | 
|  | // | 
|  | UVector *AlphabeticIndex::firstStringsInScript(Collator *ruleBasedCollator, UErrorCode &status) { | 
|  |  | 
|  | if (U_FAILURE(status)) { | 
|  | return NULL; | 
|  | } | 
|  |  | 
|  | UnicodeString results[USCRIPT_CODE_LIMIT]; | 
|  | UnicodeString LOWER_A = UNICODE_STRING_SIMPLE("a"); | 
|  |  | 
|  | UnicodeSetIterator siter(*TO_TRY); | 
|  | while (siter.next()) { | 
|  | const UnicodeString ¤t = siter.getString(); | 
|  | Collator::EComparisonResult r = ruleBasedCollator->compare(current, LOWER_A); | 
|  | if (r < 0) {  // TODO fix; we only want "real" script characters, not | 
|  | // symbols. | 
|  | continue; | 
|  | } | 
|  |  | 
|  | int script = uscript_getScript(current.char32At(0), &status); | 
|  | if (results[script].length() == 0) { | 
|  | results[script] = current; | 
|  | } | 
|  | else if (ruleBasedCollator->compare(current, results[script]) < 0) { | 
|  | results[script] = current; | 
|  | } | 
|  | } | 
|  |  | 
|  | UnicodeSet extras; | 
|  | UnicodeSet expansions; | 
|  | RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(ruleBasedCollator); | 
|  | const UCollator *uRuleBasedCollator = rbc->getUCollator(); | 
|  | ucol_getContractionsAndExpansions(uRuleBasedCollator, extras.toUSet(), expansions.toUSet(), true, &status); | 
|  | extras.addAll(expansions).removeAll(*TO_TRY); | 
|  | if (extras.size() != 0) { | 
|  | const Normalizer2 *normalizer = Normalizer2::getNFKCInstance(status); | 
|  | UnicodeSetIterator extrasIter(extras); | 
|  | while (extrasIter.next()) { | 
|  | const UnicodeString ¤t = extrasIter.next(); | 
|  | if (!TO_TRY->containsAll(current)) | 
|  | continue; | 
|  | if (!normalizer->isNormalized(current, status) || | 
|  | ruleBasedCollator->compare(current, LOWER_A) < 0) { | 
|  | continue; | 
|  | } | 
|  | int script = uscript_getScript(current.char32At(0), &status); | 
|  | if (results[script].length() == 0) { | 
|  | results[script] = current; | 
|  | } else if (ruleBasedCollator->compare(current, results[script]) < 0) { | 
|  | results[script] = current; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | UVector *dest = new UVector(status); | 
|  | dest->setDeleter(uprv_deleteUObject); | 
|  | for (uint32_t i = 0; i < sizeof(results) / sizeof(results[0]); ++i) { | 
|  | if (results[i].length() > 0) { | 
|  | dest->addElement(results[i].clone(), status); | 
|  | } | 
|  | } | 
|  | dest->sortWithUComparator(sortCollateComparator, ruleBasedCollator, status); | 
|  | return dest; | 
|  | } | 
|  | #endif | 
|  |  | 
|  |  | 
|  | // | 
|  | //  First characters in scripts. | 
|  | //  Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script. | 
|  | //  The vector is sorted according to this index's collation. | 
|  | // | 
|  | //  It takes too much time to compute this from character properties, so hard code it for now. | 
|  | //  Character constants copied from corresponding declaration in ICU4J. | 
|  | //  See main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java | 
|  |  | 
|  | static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] =  { 0x61, 0, 0x03B1, 0, | 
|  | 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0, | 
|  | 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0, | 
|  | 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0, | 
|  | 0xAAF2, 0,  // Meetei Mayek | 
|  | 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0, | 
|  | U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0,  // Sharada | 
|  | U16_LEAD(0x11680), U16_TRAIL(0x11680), 0,  // Takri | 
|  | 0x1B83, 0, | 
|  | 0xD802, 0xDE00, 0, 0x0E01, 0, | 
|  | 0x0EDE, 0,  // Lao | 
|  | 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0, | 
|  | 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0, | 
|  | U16_LEAD(0x11103), U16_TRAIL(0x11103), 0,  // Chakma | 
|  | 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0, | 
|  | 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0, | 
|  | 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0, | 
|  | U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0,  // Miao | 
|  | 0xD800, 0xDE80, 0, | 
|  | 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0, | 
|  | 0xD801, 0xDC80, 0, | 
|  | U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0,  // Sora Sompeng | 
|  | 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0, | 
|  | 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0, | 
|  | U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0,  // Meroitic Cursive | 
|  | U16_LEAD(0x10980), U16_TRAIL(0x10980), 0,  // Meroitic Hieroglyphs | 
|  | 0x4E00, 0 }; | 
|  |  | 
|  | UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { | 
|  | if (U_FAILURE(status)) { | 
|  | return NULL; | 
|  | } | 
|  | UVector *dest = new UVector(status); | 
|  | if (dest == NULL) { | 
|  | if (U_SUCCESS(status)) { | 
|  | status = U_MEMORY_ALLOCATION_ERROR; | 
|  | } | 
|  | return NULL; | 
|  | } | 
|  | dest->setDeleter(uprv_deleteUObject); | 
|  | const UChar *src  = HACK_FIRST_CHARS_IN_SCRIPTS; | 
|  | const UChar *limit = src + sizeof(HACK_FIRST_CHARS_IN_SCRIPTS) / sizeof(HACK_FIRST_CHARS_IN_SCRIPTS[0]); | 
|  | do { | 
|  | if (U_FAILURE(status)) { | 
|  | return dest; | 
|  | } | 
|  | UnicodeString *str = new UnicodeString(src, -1); | 
|  | if (str == NULL) { | 
|  | status = U_MEMORY_ALLOCATION_ERROR; | 
|  | } else { | 
|  | dest->addElement(str, status); | 
|  | src += str->length() + 1; | 
|  | } | 
|  | } while (src < limit); | 
|  | dest->sortWithUComparator(sortCollateComparator, collator_, status); | 
|  | return dest; | 
|  | } | 
|  |  | 
|  |  | 
|  | AlphabeticIndex::ELangType AlphabeticIndex::langTypeFromLocale(const Locale &loc) { | 
|  | const char *lang = loc.getLanguage(); | 
|  | if (uprv_strcmp(lang, "zh") != 0) { | 
|  | return kNormal; | 
|  | } | 
|  | const char *script = loc.getScript(); | 
|  | if (uprv_strcmp(script, "Hant") == 0) { | 
|  | return kTraditional; | 
|  | } | 
|  | const char *country = loc.getCountry(); | 
|  | if (uprv_strcmp(country, "TW") == 0) { | 
|  | return kTraditional; | 
|  | } | 
|  | return kSimplified; | 
|  | } | 
|  |  | 
|  |  | 
|  | // | 
|  | // Pinyin Hacks.  Direct port from Java. | 
|  | // | 
|  |  | 
|  | static const UChar32  probeCharInLong = 0x28EAD; | 
|  |  | 
|  |  | 
|  | static const UChar PINYIN_LOWER_BOUNDS_SHORT[] = {      // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz" | 
|  | 0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D, | 
|  | /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A}; | 
|  |  | 
|  |  | 
|  | // Pinyin lookup tables copied, pasted (and reformatted) from the ICU4J code. | 
|  |  | 
|  | AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_SHORT = { | 
|  | {(UChar)0,      (UChar)0, (UChar)0}, // A | 
|  | {(UChar)0x516B, (UChar)0, (UChar)0}, // B | 
|  | {(UChar)0x5693, (UChar)0, (UChar)0}, // C | 
|  | {(UChar)0x5491, (UChar)0, (UChar)0}, // D | 
|  | {(UChar)0x59B8, (UChar)0, (UChar)0}, // E | 
|  | {(UChar)0x53D1, (UChar)0, (UChar)0}, // F | 
|  | {(UChar)0x65EE, (UChar)0, (UChar)0}, // G | 
|  | {(UChar)0x54C8, (UChar)0, (UChar)0}, // H | 
|  | {(UChar)0x4E0C, (UChar)0, (UChar)0}, // J | 
|  | {(UChar)0x5494, (UChar)0, (UChar)0}, // K | 
|  | {(UChar)0x5783, (UChar)0, (UChar)0}, // L | 
|  | {(UChar)0x5452, (UChar)0, (UChar)0}, // M | 
|  | {(UChar)0x5514, (UChar)0, (UChar)0}, // N | 
|  | {(UChar)0x5594, (UChar)0, (UChar)0}, // O | 
|  | {(UChar)0x5991, (UChar)0, (UChar)0}, // P | 
|  | {(UChar)0x4E03, (UChar)0, (UChar)0}, // Q | 
|  | {(UChar)0x513F, (UChar)0, (UChar)0}, // R | 
|  | {(UChar)0x4EE8, (UChar)0, (UChar)0}, // S | 
|  | {(UChar)0x4ED6, (UChar)0, (UChar)0}, // T | 
|  | {(UChar)0x7A75, (UChar)0, (UChar)0}, // W | 
|  | {(UChar)0x5915, (UChar)0, (UChar)0}, // X | 
|  | {(UChar)0x4E2B, (UChar)0, (UChar)0}, // Y | 
|  | {(UChar)0x5E00, (UChar)0, (UChar)0}, // Z | 
|  | {(UChar)0xFFFF, (UChar)0, (UChar)0}, // mark end of array | 
|  | }; | 
|  |  | 
|  | static const UChar PINYIN_LOWER_BOUNDS_LONG[] = {   // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz"; | 
|  | 0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D, | 
|  | /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A}; | 
|  |  | 
|  | AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_LONG = { | 
|  | {(UChar)0,      (UChar)0,      (UChar)0}, // A | 
|  | {(UChar)0x516B, (UChar)0,      (UChar)0}, // b | 
|  | {(UChar)0xD863, (UChar)0xDEAD, (UChar)0}, // c | 
|  | {(UChar)0xD844, (UChar)0xDE51, (UChar)0}, // d | 
|  | {(UChar)0x59B8, (UChar)0,      (UChar)0}, // e | 
|  | {(UChar)0x53D1, (UChar)0,      (UChar)0}, // f | 
|  | {(UChar)0xD844, (UChar)0xDE45, (UChar)0}, // g | 
|  | {(UChar)0x54C8, (UChar)0,      (UChar)0}, // h | 
|  | {(UChar)0x4E0C, (UChar)0,      (UChar)0}, // j | 
|  | {(UChar)0x5494, (UChar)0,      (UChar)0}, // k | 
|  | {(UChar)0x3547, (UChar)0,      (UChar)0}, // l | 
|  | {(UChar)0x5452, (UChar)0,      (UChar)0}, // m | 
|  | {(UChar)0x5514, (UChar)0,      (UChar)0}, // n | 
|  | {(UChar)0x5594, (UChar)0,      (UChar)0}, // o | 
|  | {(UChar)0xD84F, (UChar)0xDC7A, (UChar)0}, // p | 
|  | {(UChar)0x4E03, (UChar)0,      (UChar)0}, // q | 
|  | {(UChar)0x513F, (UChar)0,      (UChar)0}, // r | 
|  | {(UChar)0x4EE8, (UChar)0,      (UChar)0}, // s | 
|  | {(UChar)0x4ED6, (UChar)0,      (UChar)0}, // t | 
|  | {(UChar)0x7A75, (UChar)0,      (UChar)0}, // w | 
|  | {(UChar)0x5915, (UChar)0,      (UChar)0}, // x | 
|  | {(UChar)0x4E2B, (UChar)0,      (UChar)0}, // y | 
|  | {(UChar)0x5E00, (UChar)0,      (UChar)0}, // z | 
|  | {(UChar)0xFFFF, (UChar)0,      (UChar)0}, // mark end of array | 
|  | }; | 
|  |  | 
|  |  | 
|  | // | 
|  | //  Probe the collation data, and decide which Pinyin tables should be used | 
|  | // | 
|  | //  ICU can be built with a choice between two Chinese collations. | 
|  | //  The hack Pinyin tables to use depend on which one is in use. | 
|  | //  We can assume that any given copy of ICU will have only one of the collations available, | 
|  | //  and that there is no way, in a given process, to create two alphabetic indexes using | 
|  | //  different Chinese collations.  Which means the probe can be done once | 
|  | //  and the results cached. | 
|  | // | 
|  | //  This whole arrangement is temporary. | 
|  | // | 
|  | AlphabeticIndex::PinyinLookup *AlphabeticIndex::HACK_PINYIN_LOOKUP  = NULL; | 
|  | const UChar  *AlphabeticIndex::PINYIN_LOWER_BOUNDS = NULL; | 
|  |  | 
|  | void AlphabeticIndex::initPinyinBounds(const Collator *col, UErrorCode &status) { | 
|  | { | 
|  | Mutex m; | 
|  | if (PINYIN_LOWER_BOUNDS != NULL) { | 
|  | return; | 
|  | } | 
|  | } | 
|  | UnicodeSet *colSet = col->getTailoredSet(status); | 
|  | if (U_FAILURE(status) || colSet == NULL) { | 
|  | delete colSet; | 
|  | if (U_SUCCESS(status))  { | 
|  | status = U_MEMORY_ALLOCATION_ERROR; | 
|  | } | 
|  | return; | 
|  | } | 
|  | UBool useLongTables = colSet->contains(probeCharInLong); | 
|  | delete colSet; | 
|  | { | 
|  | Mutex m; | 
|  | if (useLongTables) { | 
|  | PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_LONG; | 
|  | HACK_PINYIN_LOOKUP  = &HACK_PINYIN_LOOKUP_LONG; | 
|  | } else { | 
|  | PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_SHORT; | 
|  | HACK_PINYIN_LOOKUP  = &HACK_PINYIN_LOOKUP_SHORT; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // Pinyin Hack: | 
|  | //    Modify a Chinese name by prepending a Latin letter.  The modified name is used | 
|  | //      when putting records (names) into buckets, to put the name under a Latin index heading. | 
|  |  | 
|  | void AlphabeticIndex::hackName(UnicodeString &dest, const UnicodeString &name, const Collator *col) { | 
|  |  | 
|  | if (langType_ != kSimplified || !UNIHAN->contains(name.char32At(0))) { | 
|  | dest = name; | 
|  | return; | 
|  | } | 
|  |  | 
|  | UErrorCode status = U_ZERO_ERROR; | 
|  | initPinyinBounds(col, status); | 
|  | if (U_FAILURE(status)) { | 
|  | dest = name; | 
|  | return; | 
|  | } | 
|  | // TODO:  use binary search | 
|  | int index; | 
|  | for (index=0; ; index++) { | 
|  | if ((*HACK_PINYIN_LOOKUP)[index][0] == (UChar)0xffff) { | 
|  | index--; | 
|  | break; | 
|  | } | 
|  | int32_t compareResult = col->compare(name, UnicodeString(TRUE, (*HACK_PINYIN_LOOKUP)[index], -1)); | 
|  | if (compareResult < 0) { | 
|  | index--; | 
|  | } | 
|  | if (compareResult <= 0) { | 
|  | break; | 
|  | } | 
|  | } | 
|  | UChar c = PINYIN_LOWER_BOUNDS[index]; | 
|  | dest.setTo(c); | 
|  | dest.append(name); | 
|  | return; | 
|  | } | 
|  |  | 
|  |  | 
|  |  | 
|  | /** | 
|  | * Comparator that returns "better" items first, where shorter NFKD is better, and otherwise NFKD binary order is | 
|  | * better, and otherwise binary order is better. | 
|  | * | 
|  | * For use with array sort or UVector. | 
|  | * @param context  A UErrorCode pointer. | 
|  | * @param left     A UElement pointer, which must refer to a UnicodeString * | 
|  | * @param right    A UElement pointer, which must refer to a UnicodeString * | 
|  | */ | 
|  |  | 
|  | static int32_t U_CALLCONV | 
|  | PreferenceComparator(const void *context, const void *left, const void *right) { | 
|  | const UElement *leftElement  = static_cast<const UElement *>(left); | 
|  | const UElement *rightElement = static_cast<const UElement *>(right); | 
|  | const UnicodeString *s1  = static_cast<const UnicodeString *>(leftElement->pointer); | 
|  | const UnicodeString *s2  = static_cast<const UnicodeString *>(rightElement->pointer); | 
|  | UErrorCode &status       = *(UErrorCode *)(context);   // Cast off both static and const. | 
|  | if (s1 == s2) { | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | UnicodeString n1 = nfkdNormalizer->normalize(*s1, status); | 
|  | UnicodeString n2 = nfkdNormalizer->normalize(*s2, status); | 
|  | int32_t result = n1.length() - n2.length(); | 
|  | if (result != 0) { | 
|  | return result; | 
|  | } | 
|  |  | 
|  | result = n1.compareCodePointOrder(n2); | 
|  | if (result != 0) { | 
|  | return result; | 
|  | } | 
|  | return s1->compareCodePointOrder(*s2); | 
|  | } | 
|  |  | 
|  |  | 
|  | // | 
|  | //  Constructor & Destructor for AlphabeticIndex::Record | 
|  | // | 
|  | //     Records are internal only, instances are not directly surfaced in the public API. | 
|  | //     This class is mostly struct-like, with all public fields. | 
|  |  | 
|  | AlphabeticIndex::Record::Record(AlphabeticIndex *alphaIndex, const UnicodeString &name, const void *data): | 
|  | alphaIndex_(alphaIndex), name_(name), data_(data) | 
|  | { | 
|  | UnicodeString prefixedName; | 
|  | alphaIndex->hackName(sortingName_, name_, alphaIndex->collatorPrimaryOnly_); | 
|  | serialNumber_ = ++alphaIndex->recordCounter_; | 
|  | } | 
|  |  | 
|  | AlphabeticIndex::Record::~Record() { | 
|  | } | 
|  |  | 
|  |  | 
|  | AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) { | 
|  | if (U_FAILURE(status)) { | 
|  | return *this; | 
|  | } | 
|  | Record *r = new Record(this, name, data); | 
|  | inputRecords_->addElement(r, status); | 
|  | indexBuildRequired_ = TRUE; | 
|  | //std::string ss; | 
|  | //std::string ss2; | 
|  | //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" << | 
|  | //             "   sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl; | 
|  | return *this; | 
|  | } | 
|  |  | 
|  |  | 
|  | AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) { | 
|  | if (U_FAILURE(status)) { | 
|  | return *this; | 
|  | } | 
|  | inputRecords_->removeAllElements(); | 
|  | indexBuildRequired_ = TRUE; | 
|  | return *this; | 
|  | } | 
|  |  | 
|  |  | 
|  | int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) { | 
|  | buildIndex(status); | 
|  | if (U_FAILURE(status)) { | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | // For simplified Chinese prepend a prefix to the name. | 
|  | //   For non-Chinese locales or non-Chinese names, the name is not modified. | 
|  |  | 
|  | UnicodeString prefixedName; | 
|  | hackName(prefixedName, name, collatorPrimaryOnly_); | 
|  |  | 
|  | // TODO:  use a binary search. | 
|  | for (int32_t i = 0; i < bucketList_->size(); ++i) { | 
|  | Bucket *bucket = static_cast<Bucket *>(bucketList_->elementAt(i)); | 
|  | Collator::EComparisonResult comp = collatorPrimaryOnly_->compare(prefixedName, bucket->lowerBoundary_); | 
|  | if (comp < 0) { | 
|  | return i - 1; | 
|  | } | 
|  | } | 
|  | // Loop runs until we find the bucket following the one that would hold prefixedName. | 
|  | // If the prefixedName belongs in the last bucket the loop will drop out the bottom rather | 
|  | //  than returning from the middle. | 
|  |  | 
|  | return bucketList_->size() - 1; | 
|  | } | 
|  |  | 
|  |  | 
|  | int32_t AlphabeticIndex::getBucketIndex() const { | 
|  | return labelsIterIndex_; | 
|  | } | 
|  |  | 
|  |  | 
|  | UBool AlphabeticIndex::nextBucket(UErrorCode &status) { | 
|  | if (U_FAILURE(status)) { | 
|  | return FALSE; | 
|  | } | 
|  | if (indexBuildRequired_ && currentBucket_ != NULL) { | 
|  | status = U_ENUM_OUT_OF_SYNC_ERROR; | 
|  | return FALSE; | 
|  | } | 
|  | buildIndex(status); | 
|  | if (U_FAILURE(status)) { | 
|  | return FALSE; | 
|  | } | 
|  | ++labelsIterIndex_; | 
|  | if (labelsIterIndex_ >= bucketList_->size()) { | 
|  | labelsIterIndex_ = bucketList_->size(); | 
|  | return FALSE; | 
|  | } | 
|  | currentBucket_ = static_cast<Bucket *>(bucketList_->elementAt(labelsIterIndex_)); | 
|  | resetRecordIterator(); | 
|  | return TRUE; | 
|  | } | 
|  |  | 
|  | const UnicodeString &AlphabeticIndex::getBucketLabel() const { | 
|  | if (currentBucket_ != NULL) { | 
|  | return currentBucket_->label_; | 
|  | } else { | 
|  | return *EMPTY_STRING; | 
|  | } | 
|  | } | 
|  |  | 
|  |  | 
|  | UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const { | 
|  | if (currentBucket_ != NULL) { | 
|  | return currentBucket_->labelType_; | 
|  | } else { | 
|  | return U_ALPHAINDEX_NORMAL; | 
|  | } | 
|  | } | 
|  |  | 
|  |  | 
|  | int32_t AlphabeticIndex::getBucketRecordCount() const { | 
|  | if (currentBucket_ != NULL) { | 
|  | return currentBucket_->records_->size(); | 
|  | } else { | 
|  | return 0; | 
|  | } | 
|  | } | 
|  |  | 
|  |  | 
|  | AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) { | 
|  | if (U_FAILURE(status)) { | 
|  | return *this; | 
|  | } | 
|  | buildIndex(status); | 
|  | labelsIterIndex_ = -1; | 
|  | currentBucket_ = NULL; | 
|  | return *this; | 
|  | } | 
|  |  | 
|  |  | 
|  | UBool AlphabeticIndex::nextRecord(UErrorCode &status) { | 
|  | if (U_FAILURE(status)) { | 
|  | return FALSE; | 
|  | } | 
|  | if (currentBucket_ == NULL) { | 
|  | // We are trying to iterate over the items in a bucket, but there is no | 
|  | // current bucket from the enumeration of buckets. | 
|  | status = U_INVALID_STATE_ERROR; | 
|  | return FALSE; | 
|  | } | 
|  | if (indexBuildRequired_) { | 
|  | status = U_ENUM_OUT_OF_SYNC_ERROR; | 
|  | return FALSE; | 
|  | } | 
|  | ++itemsIterIndex_; | 
|  | if (itemsIterIndex_ >= currentBucket_->records_->size()) { | 
|  | itemsIterIndex_  = currentBucket_->records_->size(); | 
|  | return FALSE; | 
|  | } | 
|  | return TRUE; | 
|  | } | 
|  |  | 
|  |  | 
|  | const UnicodeString &AlphabeticIndex::getRecordName() const { | 
|  | const UnicodeString *retStr = EMPTY_STRING; | 
|  | if (currentBucket_ != NULL && | 
|  | itemsIterIndex_ >= 0 && | 
|  | itemsIterIndex_ < currentBucket_->records_->size()) { | 
|  | Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); | 
|  | retStr = &item->name_; | 
|  | } | 
|  | return *retStr; | 
|  | } | 
|  |  | 
|  | const void *AlphabeticIndex::getRecordData() const { | 
|  | const void *retPtr = NULL; | 
|  | if (currentBucket_ != NULL && | 
|  | itemsIterIndex_ >= 0 && | 
|  | itemsIterIndex_ < currentBucket_->records_->size()) { | 
|  | Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); | 
|  | retPtr = item->data_; | 
|  | } | 
|  | return retPtr; | 
|  | } | 
|  |  | 
|  |  | 
|  | AlphabeticIndex & AlphabeticIndex::resetRecordIterator() { | 
|  | itemsIterIndex_ = -1; | 
|  | return *this; | 
|  | } | 
|  |  | 
|  |  | 
|  |  | 
|  | AlphabeticIndex::Bucket::Bucket(const UnicodeString &label, | 
|  | const UnicodeString &lowerBoundary, | 
|  | UAlphabeticIndexLabelType type, | 
|  | UErrorCode &status): | 
|  | label_(label), lowerBoundary_(lowerBoundary), labelType_(type), records_(NULL) { | 
|  | if (U_FAILURE(status)) { | 
|  | return; | 
|  | } | 
|  | records_ = new UVector(status); | 
|  | if (records_ == NULL && U_SUCCESS(status)) { | 
|  | status = U_MEMORY_ALLOCATION_ERROR; | 
|  | } | 
|  | } | 
|  |  | 
|  |  | 
|  | AlphabeticIndex::Bucket::~Bucket() { | 
|  | delete records_; | 
|  | } | 
|  |  | 
|  | U_NAMESPACE_END | 
|  |  | 
|  | #endif |