| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * Copyright (C) 1996-2014, International Business Machines Corporation and |
| * others. All Rights Reserved. |
| ******************************************************************************* |
| */ |
| |
| /* |
| * File coleitr.cpp |
| * |
| * Created by: Helena Shih |
| * |
| * Modification History: |
| * |
| * Date Name Description |
| * |
| * 6/23/97 helena Adding comments to make code more readable. |
| * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java |
| * 12/10/99 aliu Ported Thai collation support from Java. |
| * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h) |
| * 02/19/01 swquek Removed CollationElementIterator() since it is |
| * private constructor and no calls are made to it |
| * 2012-2014 markus Rewritten in C++ again. |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_COLLATION |
| |
| #if defined(STARBOARD) |
| #include "starboard/client_porting/poem/assert_poem.h" |
| #include "starboard/client_porting/poem/string_poem.h" |
| #endif // defined(STARBOARD) |
| #include "unicode/chariter.h" |
| #include "unicode/coleitr.h" |
| #include "unicode/tblcoll.h" |
| #include "unicode/ustring.h" |
| #include "cmemory.h" |
| #include "collation.h" |
| #include "collationdata.h" |
| #include "collationiterator.h" |
| #include "collationsets.h" |
| #include "collationtailoring.h" |
| #include "uassert.h" |
| #include "uhash.h" |
| #include "utf16collationiterator.h" |
| #include "uvectr32.h" |
| |
| /* Constants --------------------------------------------------------------- */ |
| |
| U_NAMESPACE_BEGIN |
| |
| UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator) |
| |
| /* CollationElementIterator public constructor/destructor ------------------ */ |
| |
| CollationElementIterator::CollationElementIterator( |
| const CollationElementIterator& other) |
| : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) { |
| *this = other; |
| } |
| |
| CollationElementIterator::~CollationElementIterator() |
| { |
| delete iter_; |
| delete offsets_; |
| } |
| |
| /* CollationElementIterator public methods --------------------------------- */ |
| |
| namespace { |
| |
// Builds the first old-style 32-bit value from the two 32-bit halves of a CE:
// the upper 16 bits of p, bits 31..24 of lower32, and bits 15..8 of lower32.
uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
    const uint32_t high16 = p & 0xffff0000;
    const uint32_t midByte = (lower32 >> 16) & 0xff00;
    const uint32_t lowByte = (lower32 >> 8) & 0xff;
    return high16 | midByte | lowByte;
}
// Builds the second old-style 32-bit value from the two 32-bit halves of a CE:
// the lower 16 bits of p (moved to the top), bits 23..16 of lower32,
// and the lowest 6 bits of lower32.
uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
    const uint32_t high16 = p << 16;
    const uint32_t midByte = (lower32 >> 8) & 0xff00;
    const uint32_t lowBits = lower32 & 0x3f;
    return high16 | midByte | lowBits;
}
| UBool ceNeedsTwoParts(int64_t ce) { |
| return (ce & INT64_C(0xffff00ff003f)) != 0; |
| } |
| |
| } // namespace |
| |
| int32_t CollationElementIterator::getOffset() const |
| { |
| if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) { |
| // CollationIterator::previousCE() decrements the CEs length |
| // while it pops CEs from its internal buffer. |
| int32_t i = iter_->getCEsLength(); |
| if (otherHalf_ != 0) { |
| // Return the trailing CE offset while we are in the middle of a 64-bit CE. |
| ++i; |
| } |
| U_ASSERT(i < offsets_->size()); |
| return offsets_->elementAti(i); |
| } |
| return iter_->getOffset(); |
| } |
| |
| /** |
| * Get the ordering priority of the next character in the string. |
| * @return the next character's ordering. Returns NULLORDER if an error has |
| * occured or if the end of string has been reached |
| */ |
| int32_t CollationElementIterator::next(UErrorCode& status) |
| { |
| if (U_FAILURE(status)) { return NULLORDER; } |
| if (dir_ > 1) { |
| // Continue forward iteration. Test this first. |
| if (otherHalf_ != 0) { |
| uint32_t oh = otherHalf_; |
| otherHalf_ = 0; |
| return oh; |
| } |
| } else if (dir_ == 1) { |
| // next() after setOffset() |
| dir_ = 2; |
| } else if (dir_ == 0) { |
| // The iter_ is already reset to the start of the text. |
| dir_ = 2; |
| } else /* dir_ < 0 */ { |
| // illegal change of direction |
| status = U_INVALID_STATE_ERROR; |
| return NULLORDER; |
| } |
| // No need to keep all CEs in the buffer when we iterate. |
| iter_->clearCEsIfNoneRemaining(); |
| int64_t ce = iter_->nextCE(status); |
| if (ce == Collation::NO_CE) { return NULLORDER; } |
| // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. |
| uint32_t p = (uint32_t)(ce >> 32); |
| uint32_t lower32 = (uint32_t)ce; |
| uint32_t firstHalf = getFirstHalf(p, lower32); |
| uint32_t secondHalf = getSecondHalf(p, lower32); |
| if (secondHalf != 0) { |
| otherHalf_ = secondHalf | 0xc0; // continuation CE |
| } |
| return firstHalf; |
| } |
| |
| UBool CollationElementIterator::operator!=( |
| const CollationElementIterator& other) const |
| { |
| return !(*this == other); |
| } |
| |
| UBool CollationElementIterator::operator==( |
| const CollationElementIterator& that) const |
| { |
| if (this == &that) { |
| return TRUE; |
| } |
| |
| return |
| (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) && |
| otherHalf_ == that.otherHalf_ && |
| normalizeDir() == that.normalizeDir() && |
| string_ == that.string_ && |
| *iter_ == *that.iter_; |
| } |
| |
| /** |
| * Get the ordering priority of the previous collation element in the string. |
| * @param status the error code status. |
| * @return the previous element's ordering. Returns NULLORDER if an error has |
| * occured or if the start of string has been reached. |
| */ |
| int32_t CollationElementIterator::previous(UErrorCode& status) |
| { |
| if (U_FAILURE(status)) { return NULLORDER; } |
| if (dir_ < 0) { |
| // Continue backwards iteration. Test this first. |
| if (otherHalf_ != 0) { |
| uint32_t oh = otherHalf_; |
| otherHalf_ = 0; |
| return oh; |
| } |
| } else if (dir_ == 0) { |
| iter_->resetToOffset(string_.length()); |
| dir_ = -1; |
| } else if (dir_ == 1) { |
| // previous() after setOffset() |
| dir_ = -1; |
| } else /* dir_ > 1 */ { |
| // illegal change of direction |
| status = U_INVALID_STATE_ERROR; |
| return NULLORDER; |
| } |
| if (offsets_ == NULL) { |
| offsets_ = new UVector32(status); |
| if (offsets_ == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return NULLORDER; |
| } |
| } |
| // If we already have expansion CEs, then we also have offsets. |
| // Otherwise remember the trailing offset in case we need to |
| // write offsets for an artificial expansion. |
| int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0; |
| int64_t ce = iter_->previousCE(*offsets_, status); |
| if (ce == Collation::NO_CE) { return NULLORDER; } |
| // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. |
| uint32_t p = (uint32_t)(ce >> 32); |
| uint32_t lower32 = (uint32_t)ce; |
| uint32_t firstHalf = getFirstHalf(p, lower32); |
| uint32_t secondHalf = getSecondHalf(p, lower32); |
| if (secondHalf != 0) { |
| if (offsets_->isEmpty()) { |
| // When we convert a single 64-bit CE into two 32-bit CEs, |
| // we need to make this artificial expansion behave like a normal expansion. |
| // See CollationIterator::previousCE(). |
| offsets_->addElement(iter_->getOffset(), status); |
| offsets_->addElement(limitOffset, status); |
| } |
| otherHalf_ = firstHalf; |
| return secondHalf | 0xc0; // continuation CE |
| } |
| return firstHalf; |
| } |
| |
| /** |
| * Resets the cursor to the beginning of the string. |
| */ |
| void CollationElementIterator::reset() |
| { |
| iter_ ->resetToOffset(0); |
| otherHalf_ = 0; |
| dir_ = 0; |
| } |
| |
| void CollationElementIterator::setOffset(int32_t newOffset, |
| UErrorCode& status) |
| { |
| if (U_FAILURE(status)) { return; } |
| if (0 < newOffset && newOffset < string_.length()) { |
| int32_t offset = newOffset; |
| do { |
| UChar c = string_.charAt(offset); |
| if (!rbc_->isUnsafe(c) || |
| (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) { |
| break; |
| } |
| // Back up to before this unsafe character. |
| --offset; |
| } while (offset > 0); |
| if (offset < newOffset) { |
| // We might have backed up more than necessary. |
| // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe, |
| // but for text "chu" setOffset(2) should remain at 2 |
| // although we initially back up to offset 0. |
| // Find the last safe offset no greater than newOffset by iterating forward. |
| int32_t lastSafeOffset = offset; |
| do { |
| iter_->resetToOffset(lastSafeOffset); |
| do { |
| iter_->nextCE(status); |
| if (U_FAILURE(status)) { return; } |
| } while ((offset = iter_->getOffset()) == lastSafeOffset); |
| if (offset <= newOffset) { |
| lastSafeOffset = offset; |
| } |
| } while (offset < newOffset); |
| newOffset = lastSafeOffset; |
| } |
| } |
| iter_->resetToOffset(newOffset); |
| otherHalf_ = 0; |
| dir_ = 1; |
| } |
| |
| /** |
| * Sets the source to the new source string. |
| */ |
| void CollationElementIterator::setText(const UnicodeString& source, |
| UErrorCode& status) |
| { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| string_ = source; |
| const UChar *s = string_.getBuffer(); |
| CollationIterator *newIter; |
| UBool numeric = rbc_->settings->isNumeric(); |
| if (rbc_->settings->dontCheckFCD()) { |
| newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); |
| } else { |
| newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); |
| } |
| if (newIter == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| delete iter_; |
| iter_ = newIter; |
| otherHalf_ = 0; |
| dir_ = 0; |
| } |
| |
| // Sets the source to the new character iterator. |
| void CollationElementIterator::setText(CharacterIterator& source, |
| UErrorCode& status) |
| { |
| if (U_FAILURE(status)) |
| return; |
| |
| source.getText(string_); |
| setText(string_, status); |
| } |
| |
| int32_t CollationElementIterator::strengthOrder(int32_t order) const |
| { |
| UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength(); |
| // Mask off the unwanted differences. |
| if (s == UCOL_PRIMARY) { |
| order &= 0xffff0000; |
| } |
| else if (s == UCOL_SECONDARY) { |
| order &= 0xffffff00; |
| } |
| |
| return order; |
| } |
| |
| /* CollationElementIterator private constructors/destructors --------------- */ |
| |
| /** |
| * This is the "real" constructor for this class; it constructs an iterator |
| * over the source text using the specified collator |
| */ |
| CollationElementIterator::CollationElementIterator( |
| const UnicodeString &source, |
| const RuleBasedCollator *coll, |
| UErrorCode &status) |
| : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { |
| setText(source, status); |
| } |
| |
| /** |
| * This is the "real" constructor for this class; it constructs an iterator over |
| * the source text using the specified collator |
| */ |
| CollationElementIterator::CollationElementIterator( |
| const CharacterIterator &source, |
| const RuleBasedCollator *coll, |
| UErrorCode &status) |
| : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { |
| // We only call source.getText() which should be const anyway. |
| setText(const_cast<CharacterIterator &>(source), status); |
| } |
| |
| /* CollationElementIterator private methods -------------------------------- */ |
| |
| const CollationElementIterator& CollationElementIterator::operator=( |
| const CollationElementIterator& other) |
| { |
| if (this == &other) { |
| return *this; |
| } |
| |
| CollationIterator *newIter; |
| const FCDUTF16CollationIterator *otherFCDIter = |
| dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_); |
| if(otherFCDIter != NULL) { |
| newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer()); |
| } else { |
| const UTF16CollationIterator *otherIter = |
| dynamic_cast<const UTF16CollationIterator *>(other.iter_); |
| if(otherIter != NULL) { |
| newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer()); |
| } else { |
| newIter = NULL; |
| } |
| } |
| if(newIter != NULL) { |
| delete iter_; |
| iter_ = newIter; |
| rbc_ = other.rbc_; |
| otherHalf_ = other.otherHalf_; |
| dir_ = other.dir_; |
| |
| string_ = other.string_; |
| } |
| if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) { |
| UErrorCode errorCode = U_ZERO_ERROR; |
| if(offsets_ == NULL) { |
| offsets_ = new UVector32(other.offsets_->size(), errorCode); |
| } |
| if(offsets_ != NULL) { |
| offsets_->assign(*other.offsets_, errorCode); |
| } |
| } |
| return *this; |
| } |
| |
| namespace { |
| |
| class MaxExpSink : public ContractionsAndExpansions::CESink { |
| public: |
| MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {} |
| virtual ~MaxExpSink(); |
| virtual void handleCE(int64_t /*ce*/) {} |
| virtual void handleExpansion(const int64_t ces[], int32_t length) { |
| if (length <= 1) { |
| // We do not need to add single CEs into the map. |
| return; |
| } |
| int32_t count = 0; // number of CE "halves" |
| for (int32_t i = 0; i < length; ++i) { |
| count += ceNeedsTwoParts(ces[i]) ? 2 : 1; |
| } |
| // last "half" of the last CE |
| int64_t ce = ces[length - 1]; |
| uint32_t p = (uint32_t)(ce >> 32); |
| uint32_t lower32 = (uint32_t)ce; |
| uint32_t lastHalf = getSecondHalf(p, lower32); |
| if (lastHalf == 0) { |
| lastHalf = getFirstHalf(p, lower32); |
| U_ASSERT(lastHalf != 0); |
| } else { |
| lastHalf |= 0xc0; // old-style continuation CE |
| } |
| if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) { |
| uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode); |
| } |
| } |
| |
| private: |
| UHashtable *maxExpansions; |
| UErrorCode &errorCode; |
| }; |
| |
| MaxExpSink::~MaxExpSink() {} |
| |
| } // namespace |
| |
| UHashtable * |
| CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) { |
| if (U_FAILURE(errorCode)) { return NULL; } |
| UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong, |
| uhash_compareLong, &errorCode); |
| if (U_FAILURE(errorCode)) { return NULL; } |
| MaxExpSink sink(maxExpansions, errorCode); |
| ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode); |
| if (U_FAILURE(errorCode)) { |
| uhash_close(maxExpansions); |
| return NULL; |
| } |
| return maxExpansions; |
| } |
| |
| int32_t |
| CollationElementIterator::getMaxExpansion(int32_t order) const { |
| return getMaxExpansion(rbc_->tailoring->maxExpansions, order); |
| } |
| |
| int32_t |
| CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) { |
| if (order == 0) { return 1; } |
| int32_t max; |
| if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) { |
| return max; |
| } |
| if ((order & 0xc0) == 0xc0) { |
| // old-style continuation CE |
| return 2; |
| } else { |
| return 1; |
| } |
| } |
| |
| U_NAMESPACE_END |
| |
| #endif /* #if !UCONFIG_NO_COLLATION */ |