| /* |
| ****************************************************************************** |
| * Copyright (C) 1996-2012, International Business Machines Corporation and |
| * others. All Rights Reserved. |
| ****************************************************************************** |
| */ |
| |
| /** |
| * File tblcoll.cpp |
| * |
| * Created by: Helena Shih |
| * |
| * Modification History: |
| * |
| * Date Name Description |
| * 2/5/97 aliu Added streamIn and streamOut methods. Added |
| * constructor which reads RuleBasedCollator object from |
| * a binary file. Added writeToFile method which streams |
| * RuleBasedCollator out to a binary file. The streamIn |
| * and streamOut methods use istream and ostream objects |
| * in binary mode. |
| * 2/11/97 aliu Moved declarations out of for loop initializer. |
| * Added Mac compatibility #ifdef for ios::nocreate. |
| * 2/12/97 aliu Modified to use TableCollationData sub-object to |
| * hold invariant data. |
| * 2/13/97 aliu Moved several methods into this class from Collation. |
| * Added a private RuleBasedCollator(Locale&) constructor, |
| * to be used by Collator::getInstance(). General |
| * clean up. Made use of UErrorCode variables consistent. |
| * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy |
| * constructor and getDynamicClassID. |
| * 3/5/97 aliu Changed compaction cycle to improve performance. We |
| * use the maximum allowable value which is kBlockCount. |
| * Modified getRules() to load rules dynamically. Changed |
| * constructFromFile() call to accommodate this (added |
| * parameter to specify whether binary loading is to |
| * take place). |
| * 05/06/97 helena Added memory allocation error check. |
| * 6/20/97 helena Java class name change. |
| * 6/23/97 helena Adding comments to make code more readable. |
| * 09/03/97 helena Added createCollationKeyValues(). |
| * 06/26/98 erm Changes for CollationKeys using byte arrays. |
| * 08/10/98 erm Synched with 1.2 version of RuleBasedCollator.java |
| * 04/23/99 stephen Removed EDecompositionMode, merged with |
| * Normalizer::EMode |
| * 06/14/99 stephen Removed kResourceBundleSuffix |
| * 06/22/99 stephen Fixed logic in constructFromFile() since .ctx |
| * files are no longer used. |
| * 11/02/99 helena Collator performance enhancements. Special case |
| * for NO_OP situations. |
| * 11/17/99 srl More performance enhancements. Inlined some internal functions. |
| * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator |
| * to implementation file. |
| * 01/29/01 synwee Modified into a C++ wrapper calling C APIs (ucol.h) |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_COLLATION |
| |
| #include "unicode/tblcoll.h" |
| #include "unicode/coleitr.h" |
| #include "unicode/ures.h" |
| #include "unicode/uset.h" |
| #include "ucol_imp.h" |
| #include "uresimp.h" |
| #include "uhash.h" |
| #include "cmemory.h" |
| #include "cstring.h" |
| #include "putilimp.h" |
| #include "ustr_imp.h" |
| |
| /* public RuleBasedCollator constructor ---------------------------------- */ |
| |
| U_NAMESPACE_BEGIN |
| |
| /** |
| * Copy constructor, aliasing, not write-through |
| */ |
| RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator& that) |
| : Collator(that) |
| , dataIsOwned(FALSE) |
| , isWriteThroughAlias(FALSE) |
| , ucollator(NULL) |
| { |
| RuleBasedCollator::operator=(that); |
| } |
| |
| RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, |
| UErrorCode& status) : |
| dataIsOwned(FALSE) |
| { |
| construct(rules, |
| UCOL_DEFAULT_STRENGTH, |
| UCOL_DEFAULT, |
| status); |
| } |
| |
| RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, |
| ECollationStrength collationStrength, |
| UErrorCode& status) : dataIsOwned(FALSE) |
| { |
| construct(rules, |
| (UColAttributeValue)collationStrength, |
| UCOL_DEFAULT, |
| status); |
| } |
| |
| RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, |
| UColAttributeValue decompositionMode, |
| UErrorCode& status) : |
| dataIsOwned(FALSE) |
| { |
| construct(rules, |
| UCOL_DEFAULT_STRENGTH, |
| decompositionMode, |
| status); |
| } |
| |
| RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, |
| ECollationStrength collationStrength, |
| UColAttributeValue decompositionMode, |
| UErrorCode& status) : dataIsOwned(FALSE) |
| { |
| construct(rules, |
| (UColAttributeValue)collationStrength, |
| decompositionMode, |
| status); |
| } |
| RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length, |
| const RuleBasedCollator *base, |
| UErrorCode &status) : |
| dataIsOwned(TRUE), |
| isWriteThroughAlias(FALSE) |
| { |
| ucollator = ucol_openBinary(bin, length, base->ucollator, &status); |
| } |
| |
| void |
| RuleBasedCollator::setRuleStringFromCollator() |
| { |
| int32_t length; |
| const UChar *r = ucol_getRules(ucollator, &length); |
| |
| if (r && length > 0) { |
| // alias the rules string |
| urulestring.setTo(TRUE, r, length); |
| } |
| else { |
| urulestring.truncate(0); // Clear string. |
| } |
| } |
| |
| // not aliasing, not write-through |
| void |
| RuleBasedCollator::construct(const UnicodeString& rules, |
| UColAttributeValue collationStrength, |
| UColAttributeValue decompositionMode, |
| UErrorCode& status) |
| { |
| ucollator = ucol_openRules(rules.getBuffer(), rules.length(), |
| decompositionMode, collationStrength, |
| NULL, &status); |
| |
| dataIsOwned = TRUE; // since we own a collator now, we need to get rid of it |
| isWriteThroughAlias = FALSE; |
| |
| if(ucollator == NULL) { |
| if(U_SUCCESS(status)) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| return; // Failure |
| } |
| |
| setRuleStringFromCollator(); |
| } |
| |
| /* RuleBasedCollator public destructor ----------------------------------- */ |
| |
| RuleBasedCollator::~RuleBasedCollator() |
| { |
| if (dataIsOwned) |
| { |
| ucol_close(ucollator); |
| } |
| ucollator = 0; |
| } |
| |
| /* RuleBasedCollator public methods -------------------------------------- */ |
| |
| UBool RuleBasedCollator::operator==(const Collator& that) const |
| { |
| /* only checks for address equals here */ |
| if (this == &that) { |
| return TRUE; |
| } |
| if (!Collator::operator==(that)) { |
| return FALSE; /* not the same class */ |
| } |
| |
| RuleBasedCollator& thatAlias = (RuleBasedCollator&)that; |
| |
| return ucol_equals(this->ucollator, thatAlias.ucollator); |
| } |
| |
| // aliasing, not write-through |
| RuleBasedCollator& RuleBasedCollator::operator=(const RuleBasedCollator& that) |
| { |
| if (this == &that) { return *this; } |
| |
| UErrorCode intStatus = U_ZERO_ERROR; |
| int32_t buffersize = U_COL_SAFECLONE_BUFFERSIZE; |
| UCollator *ucol = ucol_safeClone(that.ucollator, NULL, &buffersize, &intStatus); |
| if (U_FAILURE(intStatus)) { return *this; } |
| |
| if (dataIsOwned) { |
| ucol_close(ucollator); |
| } |
| ucollator = ucol; |
| dataIsOwned = TRUE; |
| isWriteThroughAlias = FALSE; |
| setRuleStringFromCollator(); |
| return *this; |
| } |
| |
| // aliasing, not write-through |
| Collator* RuleBasedCollator::clone() const |
| { |
| RuleBasedCollator* coll = new RuleBasedCollator(*this); |
| // There is a small chance that the internal ucol_safeClone() call fails. |
| if (coll != NULL && coll->ucollator == NULL) { |
| delete coll; |
| return NULL; |
| } |
| return coll; |
| } |
| |
| |
| CollationElementIterator* RuleBasedCollator::createCollationElementIterator |
| (const UnicodeString& source) const |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| CollationElementIterator *result = new CollationElementIterator(source, this, |
| status); |
| if (U_FAILURE(status)) { |
| delete result; |
| return NULL; |
| } |
| |
| return result; |
| } |
| |
| /** |
| * Create a CollationElementIterator object that will iterate over the |
| * elements in a string, using the collation rules defined in this |
| * RuleBasedCollator |
| */ |
| CollationElementIterator* RuleBasedCollator::createCollationElementIterator |
| (const CharacterIterator& source) const |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| CollationElementIterator *result = new CollationElementIterator(source, this, |
| status); |
| |
| if (U_FAILURE(status)) { |
| delete result; |
| return NULL; |
| } |
| |
| return result; |
| } |
| |
| /** |
| * Return a string representation of this collator's rules. The string can |
| * later be passed to the constructor that takes a UnicodeString argument, |
| * which will construct a collator that's functionally identical to this one. |
| * You can also allow users to edit the string in order to change the collation |
| * data, or you can print it out for inspection, or whatever. |
| */ |
| const UnicodeString& RuleBasedCollator::getRules() const |
| { |
| return urulestring; |
| } |
| |
| void RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) |
| { |
| int32_t rulesize = ucol_getRulesEx(ucollator, delta, NULL, -1); |
| |
| if (rulesize > 0) { |
| UChar *rules = (UChar*) uprv_malloc( sizeof(UChar) * (rulesize) ); |
| if(rules != NULL) { |
| ucol_getRulesEx(ucollator, delta, rules, rulesize); |
| buffer.setTo(rules, rulesize); |
| uprv_free(rules); |
| } else { // couldn't allocate |
| buffer.remove(); |
| } |
| } |
| else { |
| buffer.remove(); |
| } |
| } |
| |
| UnicodeSet * |
| RuleBasedCollator::getTailoredSet(UErrorCode &status) const |
| { |
| if(U_FAILURE(status)) { |
| return NULL; |
| } |
| return (UnicodeSet *)ucol_getTailoredSet(this->ucollator, &status); |
| } |
| |
| |
| void RuleBasedCollator::getVersion(UVersionInfo versionInfo) const |
| { |
| if (versionInfo!=NULL){ |
| ucol_getVersion(ucollator, versionInfo); |
| } |
| } |
| |
| /** |
| * Compare two strings using this collator |
| */ |
| UCollationResult RuleBasedCollator::compare( |
| const UnicodeString& source, |
| const UnicodeString& target, |
| int32_t length, |
| UErrorCode &status) const |
| { |
| return compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status); |
| } |
| |
| UCollationResult RuleBasedCollator::compare(const UChar* source, |
| int32_t sourceLength, |
| const UChar* target, |
| int32_t targetLength, |
| UErrorCode &status) const |
| { |
| if(U_SUCCESS(status)) { |
| return ucol_strcoll(ucollator, source, sourceLength, target, targetLength); |
| } else { |
| return UCOL_EQUAL; |
| } |
| } |
| |
| UCollationResult RuleBasedCollator::compare( |
| const UnicodeString& source, |
| const UnicodeString& target, |
| UErrorCode &status) const |
| { |
| if(U_SUCCESS(status)) { |
| return ucol_strcoll(ucollator, source.getBuffer(), source.length(), |
| target.getBuffer(), target.length()); |
| } else { |
| return UCOL_EQUAL; |
| } |
| } |
| |
| UCollationResult RuleBasedCollator::compare(UCharIterator &sIter, |
| UCharIterator &tIter, |
| UErrorCode &status) const { |
| if(U_SUCCESS(status)) { |
| return ucol_strcollIter(ucollator, &sIter, &tIter, &status); |
| } else { |
| return UCOL_EQUAL; |
| } |
| } |
| |
| /** |
| * Retrieve a collation key for the specified string. The key can be compared |
| * with other collation keys using a bitwise comparison (e.g. memcmp) to find |
| * the ordering of their respective source strings. This is handy when doing a |
| * sort, where each sort key must be compared many times. |
| * |
| * The basic algorithm here is to find all of the collation elements for each |
| * character in the source string, convert them to an ASCII representation, and |
| * put them into the collation key. But it's trickier than that. Each |
| * collation element in a string has three components: primary ('A' vs 'B'), |
| * secondary ('u' vs '\u00FC'), and tertiary ('A' vs 'a'), and a primary difference |
| * at the end of a string takes precedence over a secondary or tertiary |
| * difference earlier in the string. |
| * |
| * To account for this, we put all of the primary orders at the beginning of |
| * the string, followed by the secondary and tertiary orders. Each set of |
| * orders is terminated by nulls so that a key for a string which is an initial |
| * substring of another key will compare less without any special case. |
| * |
| * Here's a hypothetical example, with the collation element represented as a |
| * three-digit number, one digit for primary, one for secondary, etc. |
| * |
| * String: A a B \u00C9 |
| * Collation Elements: 101 100 201 511 |
| * Collation Key: 1125<null>0001<null>1011<null> |
| * |
| * To make things even trickier, secondary differences (accent marks) are |
| * compared starting at the *end* of the string in languages with French |
| * secondary ordering. But when comparing the accent marks on a single base |
| * character, they are compared from the beginning. To handle this, we reverse |
| * all of the accents that belong to each base character, then we reverse the |
| * entire string of secondary orderings at the end. |
| */ |
| CollationKey& RuleBasedCollator::getCollationKey( |
| const UnicodeString& source, |
| CollationKey& sortkey, |
| UErrorCode& status) const |
| { |
| return getCollationKey(source.getBuffer(), source.length(), sortkey, status); |
| } |
| |
| CollationKey& RuleBasedCollator::getCollationKey(const UChar* source, |
| int32_t sourceLen, |
| CollationKey& sortkey, |
| UErrorCode& status) const |
| { |
| if (U_FAILURE(status)) { |
| return sortkey.setToBogus(); |
| } |
| if (sourceLen < -1 || (source == NULL && sourceLen != 0)) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return sortkey.setToBogus(); |
| } |
| |
| if (sourceLen < 0) { |
| sourceLen = u_strlen(source); |
| } |
| if (sourceLen == 0) { |
| return sortkey.reset(); |
| } |
| |
| int32_t resultLen = ucol_getCollationKey(ucollator, source, sourceLen, sortkey, status); |
| |
| if (U_SUCCESS(status)) { |
| sortkey.setLength(resultLen); |
| } else { |
| sortkey.setToBogus(); |
| } |
| return sortkey; |
| } |
| |
| /** |
| * Return the maximum length of any expansion sequences that end with the |
| * specified comparison order. |
| * @param order a collation order returned by previous or next. |
| * @return the maximum length of any expansion seuences ending with the |
| * specified order or 1 if collation order does not occur at the end of any |
| * expansion sequence. |
| * @see CollationElementIterator#getMaxExpansion |
| */ |
| int32_t RuleBasedCollator::getMaxExpansion(int32_t order) const |
| { |
| uint8_t result; |
| UCOL_GETMAXEXPANSION(ucollator, (uint32_t)order, result); |
| return result; |
| } |
| |
| uint8_t* RuleBasedCollator::cloneRuleData(int32_t &length, |
| UErrorCode &status) |
| { |
| return ucol_cloneRuleData(ucollator, &length, &status); |
| } |
| |
| |
| int32_t RuleBasedCollator::cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status) |
| { |
| return ucol_cloneBinary(ucollator, buffer, capacity, &status); |
| } |
| |
| void RuleBasedCollator::setAttribute(UColAttribute attr, |
| UColAttributeValue value, |
| UErrorCode &status) |
| { |
| if (U_FAILURE(status)) |
| return; |
| checkOwned(); |
| ucol_setAttribute(ucollator, attr, value, &status); |
| } |
| |
| UColAttributeValue RuleBasedCollator::getAttribute(UColAttribute attr, |
| UErrorCode &status) const |
| { |
| if (U_FAILURE(status)) |
| return UCOL_DEFAULT; |
| return ucol_getAttribute(ucollator, attr, &status); |
| } |
| |
| uint32_t RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status) { |
| checkOwned(); |
| return ucol_setVariableTop(ucollator, varTop, len, &status); |
| } |
| |
| uint32_t RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &status) { |
| checkOwned(); |
| return ucol_setVariableTop(ucollator, varTop.getBuffer(), varTop.length(), &status); |
| } |
| |
| void RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &status) { |
| checkOwned(); |
| ucol_restoreVariableTop(ucollator, varTop, &status); |
| } |
| |
| uint32_t RuleBasedCollator::getVariableTop(UErrorCode &status) const { |
| return ucol_getVariableTop(ucollator, &status); |
| } |
| |
| int32_t RuleBasedCollator::getSortKey(const UnicodeString& source, |
| uint8_t *result, int32_t resultLength) |
| const |
| { |
| return ucol_getSortKey(ucollator, source.getBuffer(), source.length(), result, resultLength); |
| } |
| |
| int32_t RuleBasedCollator::getSortKey(const UChar *source, |
| int32_t sourceLength, uint8_t *result, |
| int32_t resultLength) const |
| { |
| return ucol_getSortKey(ucollator, source, sourceLength, result, resultLength); |
| } |
| |
| int32_t RuleBasedCollator::getReorderCodes(int32_t *dest, |
| int32_t destCapacity, |
| UErrorCode& status) const |
| { |
| return ucol_getReorderCodes(ucollator, dest, destCapacity, &status); |
| } |
| |
| void RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, |
| int32_t reorderCodesLength, |
| UErrorCode& status) |
| { |
| checkOwned(); |
| ucol_setReorderCodes(ucollator, reorderCodes, reorderCodesLength, &status); |
| } |
| |
| int32_t RuleBasedCollator::getEquivalentReorderCodes(int32_t reorderCode, |
| int32_t* dest, |
| int32_t destCapacity, |
| UErrorCode& status) |
| { |
| return ucol_getEquivalentReorderCodes(reorderCode, dest, destCapacity, &status); |
| } |
| |
| /** |
| * Create a hash code for this collation. Just hash the main rule table -- that |
| * should be good enough for almost any use. |
| */ |
| int32_t RuleBasedCollator::hashCode() const |
| { |
| int32_t length; |
| const UChar *rules = ucol_getRules(ucollator, &length); |
| return ustr_hashUCharsN(rules, length); |
| } |
| |
| /** |
| * return the locale of this collator |
| */ |
| Locale RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode &status) const { |
| const char *result = ucol_getLocaleByType(ucollator, type, &status); |
| if(result == NULL) { |
| Locale res(""); |
| res.setToBogus(); |
| return res; |
| } else { |
| return Locale(result); |
| } |
| } |
| |
| void |
| RuleBasedCollator::setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale) { |
| checkOwned(); |
| char* rloc = uprv_strdup(requestedLocale.getName()); |
| if (rloc) { |
| char* vloc = uprv_strdup(validLocale.getName()); |
| if (vloc) { |
| char* aloc = uprv_strdup(actualLocale.getName()); |
| if (aloc) { |
| ucol_setReqValidLocales(ucollator, rloc, vloc, aloc); |
| return; |
| } |
| uprv_free(vloc); |
| } |
| uprv_free(rloc); |
| } |
| } |
| |
| // RuleBasedCollator private constructors ----------------------------------- |
| |
| RuleBasedCollator::RuleBasedCollator() |
| : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL) |
| { |
| } |
| |
| RuleBasedCollator::RuleBasedCollator(const Locale& desiredLocale, |
| UErrorCode& status) |
| : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL) |
| { |
| if (U_FAILURE(status)) |
| return; |
| |
| /* |
| Try to load, in order: |
| 1. The desired locale's collation. |
| 2. A fallback of the desired locale. |
| 3. The default locale's collation. |
| 4. A fallback of the default locale. |
| 5. The default collation rules, which contains en_US collation rules. |
| |
| To reiterate, we try: |
| Specific: |
| language+country+variant |
| language+country |
| language |
| Default: |
| language+country+variant |
| language+country |
| language |
| Root: (aka DEFAULTRULES) |
| steps 1-5 are handled by resource bundle fallback mechanism. |
| however, in a very unprobable situation that no resource bundle |
| data exists, step 5 is repeated with hardcoded default rules. |
| */ |
| |
| setUCollator(desiredLocale, status); |
| |
| if (U_FAILURE(status)) |
| { |
| status = U_ZERO_ERROR; |
| |
| setUCollator(kRootLocaleName, status); |
| if (status == U_ZERO_ERROR) { |
| status = U_USING_DEFAULT_WARNING; |
| } |
| } |
| |
| if (U_SUCCESS(status)) |
| { |
| setRuleStringFromCollator(); |
| } |
| } |
| |
| void |
| RuleBasedCollator::setUCollator(const char *locale, |
| UErrorCode &status) |
| { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| if (ucollator && dataIsOwned) |
| ucol_close(ucollator); |
| ucollator = ucol_open_internal(locale, &status); |
| dataIsOwned = TRUE; |
| isWriteThroughAlias = FALSE; |
| } |
| |
| |
| void |
| RuleBasedCollator::checkOwned() { |
| if (!(dataIsOwned || isWriteThroughAlias)) { |
| UErrorCode status = U_ZERO_ERROR; |
| ucollator = ucol_safeClone(ucollator, NULL, NULL, &status); |
| setRuleStringFromCollator(); |
| dataIsOwned = TRUE; |
| isWriteThroughAlias = FALSE; |
| } |
| } |
| |
| |
| int32_t RuleBasedCollator::internalGetShortDefinitionString(const char *locale, |
| char *buffer, |
| int32_t capacity, |
| UErrorCode &status) const { |
| /* simply delegate */ |
| return ucol_getShortDefinitionString(ucollator, locale, buffer, capacity, &status); |
| } |
| |
| |
| UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator) |
| |
| U_NAMESPACE_END |
| |
| #endif /* #if !UCONFIG_NO_COLLATION */ |