| /* |
| ****************************************************************************** |
| * Copyright (C) 2001-2011, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ****************************************************************************** |
| * |
| * File ucoleitr.cpp |
| * |
| * Modification History: |
| * |
| * Date Name Description |
| * 02/15/2001 synwee Modified all methods to process its own function |
| * instead of calling the equivalent c++ api (coleitr.h) |
| ******************************************************************************/ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_COLLATION |
| |
| #include "unicode/ucoleitr.h" |
| #include "unicode/ustring.h" |
| #include "unicode/sortkey.h" |
| #include "unicode/uobject.h" |
| #include "ucol_imp.h" |
| #include "cmemory.h" |
| |
| U_NAMESPACE_USE |
| |
| #define BUFFER_LENGTH 100 |
| |
| #define DEFAULT_BUFFER_SIZE 16 |
| #define BUFFER_GROW 8 |
| |
| #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) |
| |
| #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0]) |
| |
| #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) |
| |
| #define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0]) |
| |
| #define DELETE_ARRAY(array) uprv_free((void *) (array)) |
| |
| typedef struct icu::collIterate collIterator; |
| |
| struct RCEI |
| { |
| uint32_t ce; |
| int32_t low; |
| int32_t high; |
| }; |
| |
| U_NAMESPACE_BEGIN |
| |
| struct RCEBuffer |
| { |
| RCEI defaultBuffer[DEFAULT_BUFFER_SIZE]; |
| RCEI *buffer; |
| int32_t bufferIndex; |
| int32_t bufferSize; |
| |
| RCEBuffer(); |
| ~RCEBuffer(); |
| |
| UBool empty() const; |
| void put(uint32_t ce, int32_t ixLow, int32_t ixHigh); |
| const RCEI *get(); |
| }; |
| |
| RCEBuffer::RCEBuffer() |
| { |
| buffer = defaultBuffer; |
| bufferIndex = 0; |
| bufferSize = DEFAULT_BUFFER_SIZE; |
| } |
| |
| RCEBuffer::~RCEBuffer() |
| { |
| if (buffer != defaultBuffer) { |
| DELETE_ARRAY(buffer); |
| } |
| } |
| |
| UBool RCEBuffer::empty() const |
| { |
| return bufferIndex <= 0; |
| } |
| |
| void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh) |
| { |
| if (bufferIndex >= bufferSize) { |
| RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW); |
| |
| ARRAY_COPY(newBuffer, buffer, bufferSize); |
| |
| if (buffer != defaultBuffer) { |
| DELETE_ARRAY(buffer); |
| } |
| |
| buffer = newBuffer; |
| bufferSize += BUFFER_GROW; |
| } |
| |
| buffer[bufferIndex].ce = ce; |
| buffer[bufferIndex].low = ixLow; |
| buffer[bufferIndex].high = ixHigh; |
| |
| bufferIndex += 1; |
| } |
| |
| const RCEI *RCEBuffer::get() |
| { |
| if (bufferIndex > 0) { |
| return &buffer[--bufferIndex]; |
| } |
| |
| return NULL; |
| } |
| |
| struct PCEI |
| { |
| uint64_t ce; |
| int32_t low; |
| int32_t high; |
| }; |
| |
| struct PCEBuffer |
| { |
| PCEI defaultBuffer[DEFAULT_BUFFER_SIZE]; |
| PCEI *buffer; |
| int32_t bufferIndex; |
| int32_t bufferSize; |
| |
| PCEBuffer(); |
| ~PCEBuffer(); |
| |
| void reset(); |
| UBool empty() const; |
| void put(uint64_t ce, int32_t ixLow, int32_t ixHigh); |
| const PCEI *get(); |
| }; |
| |
| PCEBuffer::PCEBuffer() |
| { |
| buffer = defaultBuffer; |
| bufferIndex = 0; |
| bufferSize = DEFAULT_BUFFER_SIZE; |
| } |
| |
| PCEBuffer::~PCEBuffer() |
| { |
| if (buffer != defaultBuffer) { |
| DELETE_ARRAY(buffer); |
| } |
| } |
| |
| void PCEBuffer::reset() |
| { |
| bufferIndex = 0; |
| } |
| |
| UBool PCEBuffer::empty() const |
| { |
| return bufferIndex <= 0; |
| } |
| |
| void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh) |
| { |
| if (bufferIndex >= bufferSize) { |
| PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW); |
| |
| ARRAY_COPY(newBuffer, buffer, bufferSize); |
| |
| if (buffer != defaultBuffer) { |
| DELETE_ARRAY(buffer); |
| } |
| |
| buffer = newBuffer; |
| bufferSize += BUFFER_GROW; |
| } |
| |
| buffer[bufferIndex].ce = ce; |
| buffer[bufferIndex].low = ixLow; |
| buffer[bufferIndex].high = ixHigh; |
| |
| bufferIndex += 1; |
| } |
| |
| const PCEI *PCEBuffer::get() |
| { |
| if (bufferIndex > 0) { |
| return &buffer[--bufferIndex]; |
| } |
| |
| return NULL; |
| } |
| |
| /* |
| * This inherits from UObject so that |
| * it can be allocated by new and the |
| * constructor for PCEBuffer is called. |
| */ |
| struct UCollationPCE : public UObject |
| { |
| PCEBuffer pceBuffer; |
| UCollationStrength strength; |
| UBool toShift; |
| UBool isShifted; |
| uint32_t variableTop; |
| |
| UCollationPCE(UCollationElements *elems); |
| ~UCollationPCE(); |
| |
| void init(const UCollator *coll); |
| |
| virtual UClassID getDynamicClassID() const; |
| static UClassID getStaticClassID(); |
| }; |
| |
| UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UCollationPCE) |
| |
| UCollationPCE::UCollationPCE(UCollationElements *elems) |
| { |
| init(elems->iteratordata_.coll); |
| } |
| |
| void UCollationPCE::init(const UCollator *coll) |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| |
| strength = ucol_getStrength(coll); |
| toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED; |
| isShifted = FALSE; |
| variableTop = coll->variableTopValue << 16; |
| } |
| |
| UCollationPCE::~UCollationPCE() |
| { |
| // nothing to do |
| } |
| |
| |
| U_NAMESPACE_END |
| |
| |
| inline uint64_t processCE(UCollationElements *elems, uint32_t ce) |
| { |
| uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0; |
| |
| // This is clean, but somewhat slow... |
| // We could apply the mask to ce and then |
| // just get all three orders... |
| switch(elems->pce->strength) { |
| default: |
| tertiary = ucol_tertiaryOrder(ce); |
| /* note fall-through */ |
| |
| case UCOL_SECONDARY: |
| secondary = ucol_secondaryOrder(ce); |
| /* note fall-through */ |
| |
| case UCOL_PRIMARY: |
| primary = ucol_primaryOrder(ce); |
| } |
| |
| // **** This should probably handle continuations too. **** |
| // **** That means that we need 24 bits for the primary **** |
| // **** instead of the 16 that we're currently using. **** |
| // **** So we can lay out the 64 bits as: 24.12.12.16. **** |
| // **** Another complication with continuations is that **** |
| // **** the *second* CE is marked as a continuation, so **** |
| // **** we always have to peek ahead to know how long **** |
| // **** the primary is... **** |
| if ((elems->pce->toShift && elems->pce->variableTop > ce && primary != 0) |
| || (elems->pce->isShifted && primary == 0)) { |
| |
| if (primary == 0) { |
| return UCOL_IGNORABLE; |
| } |
| |
| if (elems->pce->strength >= UCOL_QUATERNARY) { |
| quaternary = primary; |
| } |
| |
| primary = secondary = tertiary = 0; |
| elems->pce->isShifted = TRUE; |
| } else { |
| if (elems->pce->strength >= UCOL_QUATERNARY) { |
| quaternary = 0xFFFF; |
| } |
| |
| elems->pce->isShifted = FALSE; |
| } |
| |
| return primary << 48 | secondary << 32 | tertiary << 16 | quaternary; |
| } |
| |
| U_CAPI void U_EXPORT2 |
| uprv_init_pce(const UCollationElements *elems) |
| { |
| if (elems->pce != NULL) { |
| elems->pce->init(elems->iteratordata_.coll); |
| } |
| } |
| |
| |
| |
| /* public methods ---------------------------------------------------- */ |
| |
| U_CAPI UCollationElements* U_EXPORT2 |
| ucol_openElements(const UCollator *coll, |
| const UChar *text, |
| int32_t textLength, |
| UErrorCode *status) |
| { |
| if (U_FAILURE(*status)) { |
| return NULL; |
| } |
| |
| UCollationElements *result = new UCollationElements; |
| if (result == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| |
| result->reset_ = TRUE; |
| result->isWritable = FALSE; |
| result->pce = NULL; |
| |
| if (text == NULL) { |
| textLength = 0; |
| } |
| uprv_init_collIterate(coll, text, textLength, &result->iteratordata_, status); |
| |
| return result; |
| } |
| |
| |
| U_CAPI void U_EXPORT2 |
| ucol_closeElements(UCollationElements *elems) |
| { |
| if (elems != NULL) { |
| collIterate *ci = &elems->iteratordata_; |
| |
| if (ci->extendCEs) { |
| uprv_free(ci->extendCEs); |
| } |
| |
| if (ci->offsetBuffer) { |
| uprv_free(ci->offsetBuffer); |
| } |
| |
| if (elems->isWritable && elems->iteratordata_.string != NULL) |
| { |
| uprv_free((UChar *)elems->iteratordata_.string); |
| } |
| |
| if (elems->pce != NULL) { |
| delete elems->pce; |
| } |
| |
| delete elems; |
| } |
| } |
| |
| U_CAPI void U_EXPORT2 |
| ucol_reset(UCollationElements *elems) |
| { |
| collIterate *ci = &(elems->iteratordata_); |
| elems->reset_ = TRUE; |
| ci->pos = ci->string; |
| if ((ci->flags & UCOL_ITER_HASLEN) == 0 || ci->endp == NULL) { |
| ci->endp = ci->string + u_strlen(ci->string); |
| } |
| ci->CEpos = ci->toReturn = ci->CEs; |
| ci->flags = (ci->flags & UCOL_FORCE_HAN_IMPLICIT) | UCOL_ITER_HASLEN; |
| if (ci->coll->normalizationMode == UCOL_ON) { |
| ci->flags |= UCOL_ITER_NORM; |
| } |
| |
| ci->writableBuffer.remove(); |
| ci->fcdPosition = NULL; |
| |
| //ci->offsetReturn = ci->offsetStore = NULL; |
| ci->offsetRepeatCount = ci->offsetRepeatValue = 0; |
| } |
| |
| U_CAPI void U_EXPORT2 |
| ucol_forceHanImplicit(UCollationElements *elems, UErrorCode *status) |
| { |
| if (U_FAILURE(*status)) { |
| return; |
| } |
| |
| if (elems == NULL) { |
| *status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| |
| elems->iteratordata_.flags |= UCOL_FORCE_HAN_IMPLICIT; |
| } |
| |
| U_CAPI int32_t U_EXPORT2 |
| ucol_next(UCollationElements *elems, |
| UErrorCode *status) |
| { |
| int32_t result; |
| if (U_FAILURE(*status)) { |
| return UCOL_NULLORDER; |
| } |
| |
| elems->reset_ = FALSE; |
| |
| result = (int32_t)ucol_getNextCE(elems->iteratordata_.coll, |
| &elems->iteratordata_, |
| status); |
| |
| if (result == UCOL_NO_MORE_CES) { |
| result = UCOL_NULLORDER; |
| } |
| return result; |
| } |
| |
| U_CAPI int64_t U_EXPORT2 |
| ucol_nextProcessed(UCollationElements *elems, |
| int32_t *ixLow, |
| int32_t *ixHigh, |
| UErrorCode *status) |
| { |
| const UCollator *coll = elems->iteratordata_.coll; |
| int64_t result = UCOL_IGNORABLE; |
| uint32_t low = 0, high = 0; |
| |
| if (U_FAILURE(*status)) { |
| return UCOL_PROCESSED_NULLORDER; |
| } |
| |
| if (elems->pce == NULL) { |
| elems->pce = new UCollationPCE(elems); |
| } else { |
| elems->pce->pceBuffer.reset(); |
| } |
| |
| elems->reset_ = FALSE; |
| |
| do { |
| low = ucol_getOffset(elems); |
| uint32_t ce = (uint32_t) ucol_getNextCE(coll, &elems->iteratordata_, status); |
| high = ucol_getOffset(elems); |
| |
| if (ce == UCOL_NO_MORE_CES) { |
| result = UCOL_PROCESSED_NULLORDER; |
| break; |
| } |
| |
| result = processCE(elems, ce); |
| } while (result == UCOL_IGNORABLE); |
| |
| if (ixLow != NULL) { |
| *ixLow = low; |
| } |
| |
| if (ixHigh != NULL) { |
| *ixHigh = high; |
| } |
| |
| return result; |
| } |
| |
| U_CAPI int32_t U_EXPORT2 |
| ucol_previous(UCollationElements *elems, |
| UErrorCode *status) |
| { |
| if(U_FAILURE(*status)) { |
| return UCOL_NULLORDER; |
| } |
| else |
| { |
| int32_t result; |
| |
| if (elems->reset_ && (elems->iteratordata_.pos == elems->iteratordata_.string)) { |
| if (elems->iteratordata_.endp == NULL) { |
| elems->iteratordata_.endp = elems->iteratordata_.string + |
| u_strlen(elems->iteratordata_.string); |
| elems->iteratordata_.flags |= UCOL_ITER_HASLEN; |
| } |
| elems->iteratordata_.pos = elems->iteratordata_.endp; |
| elems->iteratordata_.fcdPosition = elems->iteratordata_.endp; |
| } |
| |
| elems->reset_ = FALSE; |
| |
| result = (int32_t)ucol_getPrevCE(elems->iteratordata_.coll, |
| &(elems->iteratordata_), |
| status); |
| |
| if (result == UCOL_NO_MORE_CES) { |
| result = UCOL_NULLORDER; |
| } |
| |
| return result; |
| } |
| } |
| |
| U_CAPI int64_t U_EXPORT2 |
| ucol_previousProcessed(UCollationElements *elems, |
| int32_t *ixLow, |
| int32_t *ixHigh, |
| UErrorCode *status) |
| { |
| const UCollator *coll = elems->iteratordata_.coll; |
| int64_t result = UCOL_IGNORABLE; |
| // int64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0; |
| // UCollationStrength strength = ucol_getStrength(coll); |
| // UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED; |
| // uint32_t variableTop = coll->variableTopValue; |
| int32_t low = 0, high = 0; |
| |
| if (U_FAILURE(*status)) { |
| return UCOL_PROCESSED_NULLORDER; |
| } |
| |
| if (elems->reset_ && |
| (elems->iteratordata_.pos == elems->iteratordata_.string)) { |
| if (elems->iteratordata_.endp == NULL) { |
| elems->iteratordata_.endp = elems->iteratordata_.string + |
| u_strlen(elems->iteratordata_.string); |
| elems->iteratordata_.flags |= UCOL_ITER_HASLEN; |
| } |
| |
| elems->iteratordata_.pos = elems->iteratordata_.endp; |
| elems->iteratordata_.fcdPosition = elems->iteratordata_.endp; |
| } |
| |
| if (elems->pce == NULL) { |
| elems->pce = new UCollationPCE(elems); |
| } else { |
| //elems->pce->pceBuffer.reset(); |
| } |
| |
| elems->reset_ = FALSE; |
| |
| while (elems->pce->pceBuffer.empty()) { |
| // buffer raw CEs up to non-ignorable primary |
| RCEBuffer rceb; |
| uint32_t ce; |
| |
| // **** do we need to reset rceb, or will it always be empty at this point **** |
| do { |
| high = ucol_getOffset(elems); |
| ce = ucol_getPrevCE(coll, &elems->iteratordata_, status); |
| low = ucol_getOffset(elems); |
| |
| if (ce == UCOL_NO_MORE_CES) { |
| if (! rceb.empty()) { |
| break; |
| } |
| |
| goto finish; |
| } |
| |
| rceb.put(ce, low, high); |
| } while ((ce & UCOL_PRIMARYMASK) == 0); |
| |
| // process the raw CEs |
| while (! rceb.empty()) { |
| const RCEI *rcei = rceb.get(); |
| |
| result = processCE(elems, rcei->ce); |
| |
| if (result != UCOL_IGNORABLE) { |
| elems->pce->pceBuffer.put(result, rcei->low, rcei->high); |
| } |
| } |
| } |
| |
| finish: |
| if (elems->pce->pceBuffer.empty()) { |
| // **** Is -1 the right value for ixLow, ixHigh? **** |
| if (ixLow != NULL) { |
| *ixLow = -1; |
| } |
| |
| if (ixHigh != NULL) { |
| *ixHigh = -1 |
| ; |
| } |
| return UCOL_PROCESSED_NULLORDER; |
| } |
| |
| const PCEI *pcei = elems->pce->pceBuffer.get(); |
| |
| if (ixLow != NULL) { |
| *ixLow = pcei->low; |
| } |
| |
| if (ixHigh != NULL) { |
| *ixHigh = pcei->high; |
| } |
| |
| return pcei->ce; |
| } |
| |
| U_CAPI int32_t U_EXPORT2 |
| ucol_getMaxExpansion(const UCollationElements *elems, |
| int32_t order) |
| { |
| uint8_t result; |
| |
| #if 0 |
| UCOL_GETMAXEXPANSION(elems->iteratordata_.coll, (uint32_t)order, result); |
| #else |
| const UCollator *coll = elems->iteratordata_.coll; |
| const uint32_t *start; |
| const uint32_t *limit; |
| const uint32_t *mid; |
| uint32_t strengthMask = 0; |
| uint32_t mOrder = (uint32_t) order; |
| |
| switch (coll->strength) |
| { |
| default: |
| strengthMask |= UCOL_TERTIARYORDERMASK; |
| /* fall through */ |
| |
| case UCOL_SECONDARY: |
| strengthMask |= UCOL_SECONDARYORDERMASK; |
| /* fall through */ |
| |
| case UCOL_PRIMARY: |
| strengthMask |= UCOL_PRIMARYORDERMASK; |
| } |
| |
| mOrder &= strengthMask; |
| start = (coll)->endExpansionCE; |
| limit = (coll)->lastEndExpansionCE; |
| |
| while (start < limit - 1) { |
| mid = start + ((limit - start) >> 1); |
| if (mOrder <= (*mid & strengthMask)) { |
| limit = mid; |
| } else { |
| start = mid; |
| } |
| } |
| |
| // FIXME: with a masked search, there might be more than one hit, |
| // so we need to look forward and backward from the match to find all |
| // of the hits... |
| if ((*start & strengthMask) == mOrder) { |
| result = *((coll)->expansionCESize + (start - (coll)->endExpansionCE)); |
| } else if ((*limit & strengthMask) == mOrder) { |
| result = *(coll->expansionCESize + (limit - coll->endExpansionCE)); |
| } else if ((mOrder & 0xFFFF) == 0x00C0) { |
| result = 2; |
| } else { |
| result = 1; |
| } |
| #endif |
| |
| return result; |
| } |
| |
| U_CAPI void U_EXPORT2 |
| ucol_setText( UCollationElements *elems, |
| const UChar *text, |
| int32_t textLength, |
| UErrorCode *status) |
| { |
| if (U_FAILURE(*status)) { |
| return; |
| } |
| |
| if (elems->isWritable && elems->iteratordata_.string != NULL) |
| { |
| uprv_free((UChar *)elems->iteratordata_.string); |
| } |
| |
| if (text == NULL) { |
| textLength = 0; |
| } |
| |
| elems->isWritable = FALSE; |
| |
| /* free offset buffer to avoid memory leak before initializing. */ |
| ucol_freeOffsetBuffer(&(elems->iteratordata_)); |
| /* Ensure that previously allocated extendCEs is freed before setting to NULL. */ |
| if (elems->iteratordata_.extendCEs != NULL) { |
| uprv_free(elems->iteratordata_.extendCEs); |
| } |
| uprv_init_collIterate(elems->iteratordata_.coll, text, textLength, |
| &elems->iteratordata_, status); |
| |
| elems->reset_ = TRUE; |
| } |
| |
| U_CAPI int32_t U_EXPORT2 |
| ucol_getOffset(const UCollationElements *elems) |
| { |
| const collIterate *ci = &(elems->iteratordata_); |
| |
| if (ci->offsetRepeatCount > 0 && ci->offsetRepeatValue != 0) { |
| return ci->offsetRepeatValue; |
| } |
| |
| if (ci->offsetReturn != NULL) { |
| return *ci->offsetReturn; |
| } |
| |
| // while processing characters in normalization buffer getOffset will |
| // return the next non-normalized character. |
| // should be inline with the old implementation since the old codes uses |
| // nextDecomp in normalizer which also decomposes the string till the |
| // first base character is found. |
| if (ci->flags & UCOL_ITER_INNORMBUF) { |
| if (ci->fcdPosition == NULL) { |
| return 0; |
| } |
| return (int32_t)(ci->fcdPosition - ci->string); |
| } |
| else { |
| return (int32_t)(ci->pos - ci->string); |
| } |
| } |
| |
| U_CAPI void U_EXPORT2 |
| ucol_setOffset(UCollationElements *elems, |
| int32_t offset, |
| UErrorCode *status) |
| { |
| if (U_FAILURE(*status)) { |
| return; |
| } |
| |
| // this methods will clean up any use of the writable buffer and points to |
| // the original string |
| collIterate *ci = &(elems->iteratordata_); |
| ci->pos = ci->string + offset; |
| ci->CEpos = ci->toReturn = ci->CEs; |
| if (ci->flags & UCOL_ITER_INNORMBUF) { |
| ci->flags = ci->origFlags; |
| } |
| if ((ci->flags & UCOL_ITER_HASLEN) == 0) { |
| ci->endp = ci->string + u_strlen(ci->string); |
| ci->flags |= UCOL_ITER_HASLEN; |
| } |
| ci->fcdPosition = NULL; |
| elems->reset_ = FALSE; |
| |
| ci->offsetReturn = NULL; |
| ci->offsetStore = ci->offsetBuffer; |
| ci->offsetRepeatCount = ci->offsetRepeatValue = 0; |
| } |
| |
| U_CAPI int32_t U_EXPORT2 |
| ucol_primaryOrder (int32_t order) |
| { |
| order &= UCOL_PRIMARYMASK; |
| return (order >> UCOL_PRIMARYORDERSHIFT); |
| } |
| |
| U_CAPI int32_t U_EXPORT2 |
| ucol_secondaryOrder (int32_t order) |
| { |
| order &= UCOL_SECONDARYMASK; |
| return (order >> UCOL_SECONDARYORDERSHIFT); |
| } |
| |
| U_CAPI int32_t U_EXPORT2 |
| ucol_tertiaryOrder (int32_t order) |
| { |
| return (order & UCOL_TERTIARYMASK); |
| } |
| |
| |
| void ucol_freeOffsetBuffer(collIterate *s) { |
| if (s != NULL && s->offsetBuffer != NULL) { |
| uprv_free(s->offsetBuffer); |
| s->offsetBuffer = NULL; |
| s->offsetBufferSize = 0; |
| } |
| } |
| |
| #endif /* #if !UCONFIG_NO_COLLATION */ |