| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * Copyright (C) 2013-2014, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ******************************************************************************* |
| * collationsets.cpp |
| * |
| * created on: 2013feb09 |
| * created by: Markus W. Scherer |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_COLLATION |
| |
| #if defined(STARBOARD) |
| #include "starboard/client_porting/poem/assert_poem.h" |
| #include "starboard/client_porting/poem/string_poem.h" |
| #endif // defined(STARBOARD) |
| #include "unicode/ucharstrie.h" |
| #include "unicode/uniset.h" |
| #include "unicode/unistr.h" |
| #include "unicode/ustringtrie.h" |
| #include "collation.h" |
| #include "collationdata.h" |
| #include "collationsets.h" |
| #include "normalizer2impl.h" |
| #include "uassert.h" |
| #include "utf16collationiterator.h" |
| #include "utrie2.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| U_CDECL_BEGIN |
| |
| static UBool U_CALLCONV |
| enumTailoredRange(const void *context, UChar32 start, UChar32 end, uint32_t ce32) { |
| if(ce32 == Collation::FALLBACK_CE32) { |
| return TRUE; // fallback to base, not tailored |
| } |
| TailoredSet *ts = (TailoredSet *)context; |
| return ts->handleCE32(start, end, ce32); |
| } |
| |
| U_CDECL_END |
| |
| void |
| TailoredSet::forData(const CollationData *d, UErrorCode &ec) { |
| if(U_FAILURE(ec)) { return; } |
| errorCode = ec; // Preserve info & warning codes. |
| data = d; |
| baseData = d->base; |
| U_ASSERT(baseData != NULL); |
| utrie2_enum(data->trie, NULL, enumTailoredRange, this); |
| ec = errorCode; |
| } |
| |
| UBool |
| TailoredSet::handleCE32(UChar32 start, UChar32 end, uint32_t ce32) { |
| U_ASSERT(ce32 != Collation::FALLBACK_CE32); |
| if(Collation::isSpecialCE32(ce32)) { |
| ce32 = data->getIndirectCE32(ce32); |
| if(ce32 == Collation::FALLBACK_CE32) { |
| return U_SUCCESS(errorCode); |
| } |
| } |
| do { |
| uint32_t baseCE32 = baseData->getFinalCE32(baseData->getCE32(start)); |
| // Do not just continue if ce32 == baseCE32 because |
| // contractions and expansions in different data objects |
| // normally differ even if they have the same data offsets. |
| if(Collation::isSelfContainedCE32(ce32) && Collation::isSelfContainedCE32(baseCE32)) { |
| // fastpath |
| if(ce32 != baseCE32) { |
| tailored->add(start); |
| } |
| } else { |
| compare(start, ce32, baseCE32); |
| } |
| } while(++start <= end); |
| return U_SUCCESS(errorCode); |
| } |
| |
| void |
| TailoredSet::compare(UChar32 c, uint32_t ce32, uint32_t baseCE32) { |
| if(Collation::isPrefixCE32(ce32)) { |
| const UChar *p = data->contexts + Collation::indexFromCE32(ce32); |
| ce32 = data->getFinalCE32(CollationData::readCE32(p)); |
| if(Collation::isPrefixCE32(baseCE32)) { |
| const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32); |
| baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q)); |
| comparePrefixes(c, p + 2, q + 2); |
| } else { |
| addPrefixes(data, c, p + 2); |
| } |
| } else if(Collation::isPrefixCE32(baseCE32)) { |
| const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32); |
| baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q)); |
| addPrefixes(baseData, c, q + 2); |
| } |
| |
| if(Collation::isContractionCE32(ce32)) { |
| const UChar *p = data->contexts + Collation::indexFromCE32(ce32); |
| if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) { |
| ce32 = Collation::NO_CE32; |
| } else { |
| ce32 = data->getFinalCE32(CollationData::readCE32(p)); |
| } |
| if(Collation::isContractionCE32(baseCE32)) { |
| const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32); |
| if((baseCE32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) { |
| baseCE32 = Collation::NO_CE32; |
| } else { |
| baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q)); |
| } |
| compareContractions(c, p + 2, q + 2); |
| } else { |
| addContractions(c, p + 2); |
| } |
| } else if(Collation::isContractionCE32(baseCE32)) { |
| const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32); |
| baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q)); |
| addContractions(c, q + 2); |
| } |
| |
| int32_t tag; |
| if(Collation::isSpecialCE32(ce32)) { |
| tag = Collation::tagFromCE32(ce32); |
| U_ASSERT(tag != Collation::PREFIX_TAG); |
| U_ASSERT(tag != Collation::CONTRACTION_TAG); |
| // Currently, the tailoring data builder does not write offset tags. |
| // They might be useful for saving space, |
| // but they would complicate the builder, |
| // and in tailorings we assume that performance of tailored characters is more important. |
| U_ASSERT(tag != Collation::OFFSET_TAG); |
| } else { |
| tag = -1; |
| } |
| int32_t baseTag; |
| if(Collation::isSpecialCE32(baseCE32)) { |
| baseTag = Collation::tagFromCE32(baseCE32); |
| U_ASSERT(baseTag != Collation::PREFIX_TAG); |
| U_ASSERT(baseTag != Collation::CONTRACTION_TAG); |
| } else { |
| baseTag = -1; |
| } |
| |
| // Non-contextual mappings, expansions, etc. |
| if(baseTag == Collation::OFFSET_TAG) { |
| // We might be comparing a tailoring CE which is a copy of |
| // a base offset-tag CE, via the [optimize [set]] syntax |
| // or when a single-character mapping was copied for tailored contractions. |
| // Offset tags always result in long-primary CEs, |
| // with common secondary/tertiary weights. |
| if(!Collation::isLongPrimaryCE32(ce32)) { |
| add(c); |
| return; |
| } |
| int64_t dataCE = baseData->ces[Collation::indexFromCE32(baseCE32)]; |
| uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE); |
| if(Collation::primaryFromLongPrimaryCE32(ce32) != p) { |
| add(c); |
| return; |
| } |
| } |
| |
| if(tag != baseTag) { |
| add(c); |
| return; |
| } |
| |
| if(tag == Collation::EXPANSION32_TAG) { |
| const uint32_t *ce32s = data->ce32s + Collation::indexFromCE32(ce32); |
| int32_t length = Collation::lengthFromCE32(ce32); |
| |
| const uint32_t *baseCE32s = baseData->ce32s + Collation::indexFromCE32(baseCE32); |
| int32_t baseLength = Collation::lengthFromCE32(baseCE32); |
| |
| if(length != baseLength) { |
| add(c); |
| return; |
| } |
| for(int32_t i = 0; i < length; ++i) { |
| if(ce32s[i] != baseCE32s[i]) { |
| add(c); |
| break; |
| } |
| } |
| } else if(tag == Collation::EXPANSION_TAG) { |
| const int64_t *ces = data->ces + Collation::indexFromCE32(ce32); |
| int32_t length = Collation::lengthFromCE32(ce32); |
| |
| const int64_t *baseCEs = baseData->ces + Collation::indexFromCE32(baseCE32); |
| int32_t baseLength = Collation::lengthFromCE32(baseCE32); |
| |
| if(length != baseLength) { |
| add(c); |
| return; |
| } |
| for(int32_t i = 0; i < length; ++i) { |
| if(ces[i] != baseCEs[i]) { |
| add(c); |
| break; |
| } |
| } |
| } else if(tag == Collation::HANGUL_TAG) { |
| UChar jamos[3]; |
| int32_t length = Hangul::decompose(c, jamos); |
| if(tailored->contains(jamos[0]) || tailored->contains(jamos[1]) || |
| (length == 3 && tailored->contains(jamos[2]))) { |
| add(c); |
| } |
| } else if(ce32 != baseCE32) { |
| add(c); |
| } |
| } |
| |
| void |
| TailoredSet::comparePrefixes(UChar32 c, const UChar *p, const UChar *q) { |
| // Parallel iteration over prefixes of both tables. |
| UCharsTrie::Iterator prefixes(p, 0, errorCode); |
| UCharsTrie::Iterator basePrefixes(q, 0, errorCode); |
| const UnicodeString *tp = NULL; // Tailoring prefix. |
| const UnicodeString *bp = NULL; // Base prefix. |
| // Use a string with a U+FFFF as the limit sentinel. |
| // U+FFFF is untailorable and will not occur in prefixes. |
| UnicodeString none((UChar)0xffff); |
| for(;;) { |
| if(tp == NULL) { |
| if(prefixes.next(errorCode)) { |
| tp = &prefixes.getString(); |
| } else { |
| tp = &none; |
| } |
| } |
| if(bp == NULL) { |
| if(basePrefixes.next(errorCode)) { |
| bp = &basePrefixes.getString(); |
| } else { |
| bp = &none; |
| } |
| } |
| if(tp == &none && bp == &none) { break; } |
| int32_t cmp = tp->compare(*bp); |
| if(cmp < 0) { |
| // tp occurs in the tailoring but not in the base. |
| addPrefix(data, *tp, c, (uint32_t)prefixes.getValue()); |
| tp = NULL; |
| } else if(cmp > 0) { |
| // bp occurs in the base but not in the tailoring. |
| addPrefix(baseData, *bp, c, (uint32_t)basePrefixes.getValue()); |
| bp = NULL; |
| } else { |
| setPrefix(*tp); |
| compare(c, (uint32_t)prefixes.getValue(), (uint32_t)basePrefixes.getValue()); |
| resetPrefix(); |
| tp = NULL; |
| bp = NULL; |
| } |
| } |
| } |
| |
| void |
| TailoredSet::compareContractions(UChar32 c, const UChar *p, const UChar *q) { |
| // Parallel iteration over suffixes of both tables. |
| UCharsTrie::Iterator suffixes(p, 0, errorCode); |
| UCharsTrie::Iterator baseSuffixes(q, 0, errorCode); |
| const UnicodeString *ts = NULL; // Tailoring suffix. |
| const UnicodeString *bs = NULL; // Base suffix. |
| // Use a string with two U+FFFF as the limit sentinel. |
| // U+FFFF is untailorable and will not occur in contractions except maybe |
| // as a single suffix character for a root-collator boundary contraction. |
| UnicodeString none((UChar)0xffff); |
| none.append((UChar)0xffff); |
| for(;;) { |
| if(ts == NULL) { |
| if(suffixes.next(errorCode)) { |
| ts = &suffixes.getString(); |
| } else { |
| ts = &none; |
| } |
| } |
| if(bs == NULL) { |
| if(baseSuffixes.next(errorCode)) { |
| bs = &baseSuffixes.getString(); |
| } else { |
| bs = &none; |
| } |
| } |
| if(ts == &none && bs == &none) { break; } |
| int32_t cmp = ts->compare(*bs); |
| if(cmp < 0) { |
| // ts occurs in the tailoring but not in the base. |
| addSuffix(c, *ts); |
| ts = NULL; |
| } else if(cmp > 0) { |
| // bs occurs in the base but not in the tailoring. |
| addSuffix(c, *bs); |
| bs = NULL; |
| } else { |
| suffix = ts; |
| compare(c, (uint32_t)suffixes.getValue(), (uint32_t)baseSuffixes.getValue()); |
| suffix = NULL; |
| ts = NULL; |
| bs = NULL; |
| } |
| } |
| } |
| |
| void |
| TailoredSet::addPrefixes(const CollationData *d, UChar32 c, const UChar *p) { |
| UCharsTrie::Iterator prefixes(p, 0, errorCode); |
| while(prefixes.next(errorCode)) { |
| addPrefix(d, prefixes.getString(), c, (uint32_t)prefixes.getValue()); |
| } |
| } |
| |
| void |
| TailoredSet::addPrefix(const CollationData *d, const UnicodeString &pfx, UChar32 c, uint32_t ce32) { |
| setPrefix(pfx); |
| ce32 = d->getFinalCE32(ce32); |
| if(Collation::isContractionCE32(ce32)) { |
| const UChar *p = d->contexts + Collation::indexFromCE32(ce32); |
| addContractions(c, p + 2); |
| } |
| tailored->add(UnicodeString(unreversedPrefix).append(c)); |
| resetPrefix(); |
| } |
| |
| void |
| TailoredSet::addContractions(UChar32 c, const UChar *p) { |
| UCharsTrie::Iterator suffixes(p, 0, errorCode); |
| while(suffixes.next(errorCode)) { |
| addSuffix(c, suffixes.getString()); |
| } |
| } |
| |
| void |
| TailoredSet::addSuffix(UChar32 c, const UnicodeString &sfx) { |
| tailored->add(UnicodeString(unreversedPrefix).append(c).append(sfx)); |
| } |
| |
| void |
| TailoredSet::add(UChar32 c) { |
| if(unreversedPrefix.isEmpty() && suffix == NULL) { |
| tailored->add(c); |
| } else { |
| UnicodeString s(unreversedPrefix); |
| s.append(c); |
| if(suffix != NULL) { |
| s.append(*suffix); |
| } |
| tailored->add(s); |
| } |
| } |
| |
| ContractionsAndExpansions::CESink::~CESink() {} |
| |
| U_CDECL_BEGIN |
| |
| static UBool U_CALLCONV |
| enumCnERange(const void *context, UChar32 start, UChar32 end, uint32_t ce32) { |
| ContractionsAndExpansions *cne = (ContractionsAndExpansions *)context; |
| if(cne->checkTailored == 0) { |
| // There is no tailoring. |
| // No need to collect nor check the tailored set. |
| } else if(cne->checkTailored < 0) { |
| // Collect the set of code points with mappings in the tailoring data. |
| if(ce32 == Collation::FALLBACK_CE32) { |
| return TRUE; // fallback to base, not tailored |
| } else { |
| cne->tailored.add(start, end); |
| } |
| // checkTailored > 0: Exclude tailored ranges from the base data enumeration. |
| } else if(start == end) { |
| if(cne->tailored.contains(start)) { |
| return TRUE; |
| } |
| } else if(cne->tailored.containsSome(start, end)) { |
| cne->ranges.set(start, end).removeAll(cne->tailored); |
| int32_t count = cne->ranges.getRangeCount(); |
| for(int32_t i = 0; i < count; ++i) { |
| cne->handleCE32(cne->ranges.getRangeStart(i), cne->ranges.getRangeEnd(i), ce32); |
| } |
| return U_SUCCESS(cne->errorCode); |
| } |
| cne->handleCE32(start, end, ce32); |
| return U_SUCCESS(cne->errorCode); |
| } |
| |
| U_CDECL_END |
| |
| void |
| ContractionsAndExpansions::forData(const CollationData *d, UErrorCode &ec) { |
| if(U_FAILURE(ec)) { return; } |
| errorCode = ec; // Preserve info & warning codes. |
| // Add all from the data, can be tailoring or base. |
| if(d->base != NULL) { |
| checkTailored = -1; |
| } |
| data = d; |
| utrie2_enum(data->trie, NULL, enumCnERange, this); |
| if(d->base == NULL || U_FAILURE(errorCode)) { |
| ec = errorCode; |
| return; |
| } |
| // Add all from the base data but only for un-tailored code points. |
| tailored.freeze(); |
| checkTailored = 1; |
| data = d->base; |
| utrie2_enum(data->trie, NULL, enumCnERange, this); |
| ec = errorCode; |
| } |
| |
| void |
| ContractionsAndExpansions::forCodePoint(const CollationData *d, UChar32 c, UErrorCode &ec) { |
| if(U_FAILURE(ec)) { return; } |
| errorCode = ec; // Preserve info & warning codes. |
| uint32_t ce32 = d->getCE32(c); |
| if(ce32 == Collation::FALLBACK_CE32) { |
| d = d->base; |
| ce32 = d->getCE32(c); |
| } |
| data = d; |
| handleCE32(c, c, ce32); |
| ec = errorCode; |
| } |
| |
| void |
| ContractionsAndExpansions::handleCE32(UChar32 start, UChar32 end, uint32_t ce32) { |
| for(;;) { |
| if((ce32 & 0xff) < Collation::SPECIAL_CE32_LOW_BYTE) { |
| // !isSpecialCE32() |
| if(sink != NULL) { |
| sink->handleCE(Collation::ceFromSimpleCE32(ce32)); |
| } |
| return; |
| } |
| switch(Collation::tagFromCE32(ce32)) { |
| case Collation::FALLBACK_TAG: |
| return; |
| case Collation::RESERVED_TAG_3: |
| case Collation::BUILDER_DATA_TAG: |
| case Collation::LEAD_SURROGATE_TAG: |
| if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; } |
| return; |
| case Collation::LONG_PRIMARY_TAG: |
| if(sink != NULL) { |
| sink->handleCE(Collation::ceFromLongPrimaryCE32(ce32)); |
| } |
| return; |
| case Collation::LONG_SECONDARY_TAG: |
| if(sink != NULL) { |
| sink->handleCE(Collation::ceFromLongSecondaryCE32(ce32)); |
| } |
| return; |
| case Collation::LATIN_EXPANSION_TAG: |
| if(sink != NULL) { |
| ces[0] = Collation::latinCE0FromCE32(ce32); |
| ces[1] = Collation::latinCE1FromCE32(ce32); |
| sink->handleExpansion(ces, 2); |
| } |
| // Optimization: If we have a prefix, |
| // then the relevant strings have been added already. |
| if(unreversedPrefix.isEmpty()) { |
| addExpansions(start, end); |
| } |
| return; |
| case Collation::EXPANSION32_TAG: |
| if(sink != NULL) { |
| const uint32_t *ce32s = data->ce32s + Collation::indexFromCE32(ce32); |
| int32_t length = Collation::lengthFromCE32(ce32); |
| for(int32_t i = 0; i < length; ++i) { |
| ces[i] = Collation::ceFromCE32(*ce32s++); |
| } |
| sink->handleExpansion(ces, length); |
| } |
| // Optimization: If we have a prefix, |
| // then the relevant strings have been added already. |
| if(unreversedPrefix.isEmpty()) { |
| addExpansions(start, end); |
| } |
| return; |
| case Collation::EXPANSION_TAG: |
| if(sink != NULL) { |
| int32_t length = Collation::lengthFromCE32(ce32); |
| sink->handleExpansion(data->ces + Collation::indexFromCE32(ce32), length); |
| } |
| // Optimization: If we have a prefix, |
| // then the relevant strings have been added already. |
| if(unreversedPrefix.isEmpty()) { |
| addExpansions(start, end); |
| } |
| return; |
| case Collation::PREFIX_TAG: |
| handlePrefixes(start, end, ce32); |
| return; |
| case Collation::CONTRACTION_TAG: |
| handleContractions(start, end, ce32); |
| return; |
| case Collation::DIGIT_TAG: |
| // Fetch the non-numeric-collation CE32 and continue. |
| ce32 = data->ce32s[Collation::indexFromCE32(ce32)]; |
| break; |
| case Collation::U0000_TAG: |
| U_ASSERT(start == 0 && end == 0); |
| // Fetch the normal ce32 for U+0000 and continue. |
| ce32 = data->ce32s[0]; |
| break; |
| case Collation::HANGUL_TAG: |
| if(sink != NULL) { |
| // TODO: This should be optimized, |
| // especially if [start..end] is the complete Hangul range. (assert that) |
| UTF16CollationIterator iter(data, FALSE, NULL, NULL, NULL); |
| UChar hangul[1] = { 0 }; |
| for(UChar32 c = start; c <= end; ++c) { |
| hangul[0] = (UChar)c; |
| iter.setText(hangul, hangul + 1); |
| int32_t length = iter.fetchCEs(errorCode); |
| if(U_FAILURE(errorCode)) { return; } |
| // Ignore the terminating non-CE. |
| U_ASSERT(length >= 2 && iter.getCE(length - 1) == Collation::NO_CE); |
| sink->handleExpansion(iter.getCEs(), length - 1); |
| } |
| } |
| // Optimization: If we have a prefix, |
| // then the relevant strings have been added already. |
| if(unreversedPrefix.isEmpty()) { |
| addExpansions(start, end); |
| } |
| return; |
| case Collation::OFFSET_TAG: |
| // Currently no need to send offset CEs to the sink. |
| return; |
| case Collation::IMPLICIT_TAG: |
| // Currently no need to send implicit CEs to the sink. |
| return; |
| } |
| } |
| } |
| |
| void |
| ContractionsAndExpansions::handlePrefixes( |
| UChar32 start, UChar32 end, uint32_t ce32) { |
| const UChar *p = data->contexts + Collation::indexFromCE32(ce32); |
| ce32 = CollationData::readCE32(p); // Default if no prefix match. |
| handleCE32(start, end, ce32); |
| if(!addPrefixes) { return; } |
| UCharsTrie::Iterator prefixes(p + 2, 0, errorCode); |
| while(prefixes.next(errorCode)) { |
| setPrefix(prefixes.getString()); |
| // Prefix/pre-context mappings are special kinds of contractions |
| // that always yield expansions. |
| addStrings(start, end, contractions); |
| addStrings(start, end, expansions); |
| handleCE32(start, end, (uint32_t)prefixes.getValue()); |
| } |
| resetPrefix(); |
| } |
| |
| void |
| ContractionsAndExpansions::handleContractions( |
| UChar32 start, UChar32 end, uint32_t ce32) { |
| const UChar *p = data->contexts + Collation::indexFromCE32(ce32); |
| if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) { |
| // No match on the single code point. |
| // We are underneath a prefix, and the default mapping is just |
| // a fallback to the mappings for a shorter prefix. |
| U_ASSERT(!unreversedPrefix.isEmpty()); |
| } else { |
| ce32 = CollationData::readCE32(p); // Default if no suffix match. |
| U_ASSERT(!Collation::isContractionCE32(ce32)); |
| handleCE32(start, end, ce32); |
| } |
| UCharsTrie::Iterator suffixes(p + 2, 0, errorCode); |
| while(suffixes.next(errorCode)) { |
| suffix = &suffixes.getString(); |
| addStrings(start, end, contractions); |
| if(!unreversedPrefix.isEmpty()) { |
| addStrings(start, end, expansions); |
| } |
| handleCE32(start, end, (uint32_t)suffixes.getValue()); |
| } |
| suffix = NULL; |
| } |
| |
| void |
| ContractionsAndExpansions::addExpansions(UChar32 start, UChar32 end) { |
| if(unreversedPrefix.isEmpty() && suffix == NULL) { |
| if(expansions != NULL) { |
| expansions->add(start, end); |
| } |
| } else { |
| addStrings(start, end, expansions); |
| } |
| } |
| |
| void |
| ContractionsAndExpansions::addStrings(UChar32 start, UChar32 end, UnicodeSet *set) { |
| if(set == NULL) { return; } |
| UnicodeString s(unreversedPrefix); |
| do { |
| s.append(start); |
| if(suffix != NULL) { |
| s.append(*suffix); |
| } |
| set->add(s); |
| s.truncate(unreversedPrefix.length()); |
| } while(++start <= end); |
| } |
| |
| U_NAMESPACE_END |
| |
| #endif // !UCONFIG_NO_COLLATION |