|  | /* | 
|  | ******************************************************************************* | 
|  | * Copyright (C) 2012-2015, International Business Machines | 
|  | * Corporation and others.  All Rights Reserved. | 
|  | ******************************************************************************* | 
|  | * collationtest.cpp | 
|  | * | 
|  | * created on: 2012apr27 | 
|  | * created by: Markus W. Scherer | 
|  | */ | 
|  |  | 
|  | #include "unicode/utypes.h" | 
|  |  | 
|  | #if !UCONFIG_NO_COLLATION | 
|  |  | 
|  | #include "unicode/coll.h" | 
|  | #include "unicode/errorcode.h" | 
|  | #include "unicode/localpointer.h" | 
|  | #include "unicode/normalizer2.h" | 
|  | #include "unicode/sortkey.h" | 
|  | #include "unicode/std_string.h" | 
|  | #include "unicode/strenum.h" | 
|  | #include "unicode/tblcoll.h" | 
|  | #include "unicode/uiter.h" | 
|  | #include "unicode/uniset.h" | 
|  | #include "unicode/unistr.h" | 
|  | #include "unicode/usetiter.h" | 
|  | #include "unicode/ustring.h" | 
|  | #include "charstr.h" | 
|  | #include "cmemory.h" | 
|  | #include "collation.h" | 
|  | #include "collationdata.h" | 
|  | #include "collationfcd.h" | 
|  | #include "collationiterator.h" | 
|  | #include "collationroot.h" | 
|  | #include "collationrootelements.h" | 
|  | #include "collationruleparser.h" | 
|  | #include "collationweights.h" | 
|  | #include "cstring.h" | 
|  | #include "intltest.h" | 
|  | #include "normalizer2impl.h" | 
|  | #include "ucbuf.h" | 
|  | #include "uhash.h" | 
|  | #include "uitercollationiterator.h" | 
|  | #include "utf16collationiterator.h" | 
|  | #include "utf8collationiterator.h" | 
|  | #include "uvectr32.h" | 
|  | #include "uvectr64.h" | 
|  | #include "writesrc.h" | 
|  |  | 
|  | class CodePointIterator; | 
|  |  | 
|  | // TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey) | 
|  |  | 
|  | class CollationTest : public IntlTest { | 
|  | public: | 
|  | CollationTest() | 
|  | : fcd(NULL), nfd(NULL), | 
|  | fileLineNumber(0), | 
|  | coll(NULL) {} | 
|  |  | 
|  | ~CollationTest() { | 
|  | delete coll; | 
|  | } | 
|  |  | 
|  | void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL); | 
|  |  | 
|  | void TestMinMax(); | 
|  | void TestImplicits(); | 
|  | void TestNulTerminated(); | 
|  | void TestIllegalUTF8(); | 
|  | void TestShortFCDData(); | 
|  | void TestFCD(); | 
|  | void TestCollationWeights(); | 
|  | void TestRootElements(); | 
|  | void TestTailoredElements(); | 
|  | void TestDataDriven(); | 
|  |  | 
|  | private: | 
|  | void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi); | 
|  | void checkAllocWeights(CollationWeights &cw, | 
|  | uint32_t lowerLimit, uint32_t upperLimit, int32_t n, | 
|  | int32_t someLength, int32_t minCount); | 
|  |  | 
|  | static UnicodeString printSortKey(const uint8_t *p, int32_t length); | 
|  | static UnicodeString printCollationKey(const CollationKey &key); | 
|  |  | 
|  | // Helpers & fields for data-driven test. | 
|  | static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; } | 
|  | static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; } | 
|  | static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; }  // %*@ | 
|  | int32_t skipSpaces(int32_t i) { | 
|  | while(isSpace(fileLine[i])) { ++i; } | 
|  | return i; | 
|  | } | 
|  |  | 
|  | UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode); | 
|  | void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode); | 
|  | Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode); | 
|  | void parseAndSetAttribute(IcuTestErrorCode &errorCode); | 
|  | void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode); | 
|  | void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode); | 
|  | void setRootCollator(IcuTestErrorCode &errorCode); | 
|  | void setLocaleCollator(IcuTestErrorCode &errorCode); | 
|  |  | 
|  | UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const; | 
|  |  | 
|  | UBool getSortKeyParts(const UChar *s, int32_t length, | 
|  | CharString &dest, int32_t partSize, | 
|  | IcuTestErrorCode &errorCode); | 
|  | UBool getCollationKey(const char *norm, const UnicodeString &line, | 
|  | const UChar *s, int32_t length, | 
|  | CollationKey &key, IcuTestErrorCode &errorCode); | 
|  | UBool getMergedCollationKey(const UChar *s, int32_t length, | 
|  | CollationKey &key, IcuTestErrorCode &errorCode); | 
|  | UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine, | 
|  | const UnicodeString &prevString, const UnicodeString &s, | 
|  | UCollationResult expectedOrder, Collation::Level expectedLevel, | 
|  | IcuTestErrorCode &errorCode); | 
|  | void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode); | 
|  |  | 
|  | const Normalizer2 *fcd, *nfd; | 
|  | UnicodeString fileLine; | 
|  | int32_t fileLineNumber; | 
|  | UnicodeString fileTestName; | 
|  | Collator *coll; | 
|  | }; | 
|  |  | 
|  | extern IntlTest *createCollationTest() { | 
|  | return new CollationTest(); | 
|  | } | 
|  |  | 
|  | void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) { | 
|  | if(exec) { | 
|  | logln("TestSuite CollationTest: "); | 
|  | } | 
|  | TESTCASE_AUTO_BEGIN; | 
|  | TESTCASE_AUTO(TestMinMax); | 
|  | TESTCASE_AUTO(TestImplicits); | 
|  | TESTCASE_AUTO(TestNulTerminated); | 
|  | TESTCASE_AUTO(TestIllegalUTF8); | 
|  | TESTCASE_AUTO(TestShortFCDData); | 
|  | TESTCASE_AUTO(TestFCD); | 
|  | TESTCASE_AUTO(TestCollationWeights); | 
|  | TESTCASE_AUTO(TestRootElements); | 
|  | TESTCASE_AUTO(TestTailoredElements); | 
|  | TESTCASE_AUTO(TestDataDriven); | 
|  | TESTCASE_AUTO_END; | 
|  | } | 
|  |  | 
|  | void CollationTest::TestMinMax() { | 
|  | IcuTestErrorCode errorCode(*this, "TestMinMax"); | 
|  |  | 
|  | setRootCollator(errorCode); | 
|  | if(errorCode.isFailure()) { | 
|  | errorCode.reset(); | 
|  | return; | 
|  | } | 
|  | RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll); | 
|  | if(rbc == NULL) { | 
|  | errln("the root collator is not a RuleBasedCollator"); | 
|  | return; | 
|  | } | 
|  |  | 
|  | static const UChar s[2] = { 0xfffe, 0xffff }; | 
|  | UVector64 ces(errorCode); | 
|  | rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode); | 
|  | errorCode.assertSuccess(); | 
|  | if(ces.size() != 2) { | 
|  | errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size()); | 
|  | return; | 
|  | } | 
|  | int64_t ce = ces.elementAti(0); | 
|  | int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY); | 
|  | if(ce != expected) { | 
|  | errln("CE(U+fffe)=%04lx != 02..", (long)ce); | 
|  | } | 
|  |  | 
|  | ce = ces.elementAti(1); | 
|  | expected = Collation::makeCE(Collation::MAX_PRIMARY); | 
|  | if(ce != expected) { | 
|  | errln("CE(U+ffff)=%04lx != max..", (long)ce); | 
|  | } | 
|  | } | 
|  |  | 
|  | void CollationTest::TestImplicits() { | 
|  | IcuTestErrorCode errorCode(*this, "TestImplicits"); | 
|  |  | 
|  | const CollationData *cd = CollationRoot::getData(errorCode); | 
|  | if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) { | 
|  | return; | 
|  | } | 
|  |  | 
|  | // Implicit primary weights should be assigned for the following sets, | 
|  | // and sort in ascending order by set and then code point. | 
|  | // See http://www.unicode.org/reports/tr10/#Implicit_Weights | 
|  |  | 
|  | // core Han Unified Ideographs | 
|  | UnicodeSet coreHan("[\\p{unified_ideograph}&" | 
|  | "[\\p{Block=CJK_Unified_Ideographs}" | 
|  | "\\p{Block=CJK_Compatibility_Ideographs}]]", | 
|  | errorCode); | 
|  | // all other Unified Han ideographs | 
|  | UnicodeSet otherHan("[\\p{unified ideograph}-" | 
|  | "[\\p{Block=CJK_Unified_Ideographs}" | 
|  | "\\p{Block=CJK_Compatibility_Ideographs}]]", | 
|  | errorCode); | 
|  | UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode); | 
|  | unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings. | 
|  |  | 
|  | // Starting with CLDR 26/ICU 54, the root Han order may instead be | 
|  | // the Unihan radical-stroke order. | 
|  | // The tests should pass either way, so we only test the order of a small set of Han characters | 
|  | // whose radical-stroke order is the same as their code point order. | 
|  | UnicodeSet someHanInCPOrder( | 
|  | "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48" | 
|  | "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]", | 
|  | errorCode); | 
|  | UnicodeSet inOrder(someHanInCPOrder); | 
|  | inOrder.addAll(unassigned).freeze(); | 
|  | if(errorCode.logIfFailureAndReset("UnicodeSet")) { | 
|  | return; | 
|  | } | 
|  | const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned }; | 
|  | UChar32 prev = 0; | 
|  | uint32_t prevPrimary = 0; | 
|  | UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL); | 
|  | for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) { | 
|  | LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i])); | 
|  | while(iter->next()) { | 
|  | UChar32 c = iter->getCodepoint(); | 
|  | UnicodeString s(c); | 
|  | ci.setText(s.getBuffer(), s.getBuffer() + s.length()); | 
|  | int64_t ce = ci.nextCE(errorCode); | 
|  | int64_t ce2 = ci.nextCE(errorCode); | 
|  | if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) { | 
|  | return; | 
|  | } | 
|  | if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) { | 
|  | errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c); | 
|  | continue; | 
|  | } | 
|  | if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) { | 
|  | errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx", | 
|  | (long)c, (long)(ce & 0xffffffff)); | 
|  | continue; | 
|  | } | 
|  | uint32_t primary = (uint32_t)(ce >> 32); | 
|  | if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) { | 
|  | errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..", | 
|  | (long)c, (long)primary, (long)prev, (long)prevPrimary); | 
|  | } | 
|  | prev = c; | 
|  | prevPrimary = primary; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | void CollationTest::TestNulTerminated() { | 
|  | IcuTestErrorCode errorCode(*this, "TestNulTerminated"); | 
|  | const CollationData *data = CollationRoot::getData(errorCode); | 
|  | if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) { | 
|  | return; | 
|  | } | 
|  |  | 
|  | static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 }; | 
|  |  | 
|  | UTF16CollationIterator ci1(data, FALSE, s, s, s + 2); | 
|  | UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL); | 
|  | for(int32_t i = 0;; ++i) { | 
|  | int64_t ce1 = ci1.nextCE(errorCode); | 
|  | int64_t ce2 = ci2.nextCE(errorCode); | 
|  | if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) { | 
|  | return; | 
|  | } | 
|  | if(ce1 != ce2) { | 
|  | errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i); | 
|  | break; | 
|  | } | 
|  | if(ce1 == Collation::NO_CE) { break; } | 
|  | } | 
|  | } | 
|  |  | 
|  | void CollationTest::TestIllegalUTF8() { | 
|  | IcuTestErrorCode errorCode(*this, "TestIllegalUTF8"); | 
|  |  | 
|  | setRootCollator(errorCode); | 
|  | if(errorCode.isFailure()) { | 
|  | errorCode.reset(); | 
|  | return; | 
|  | } | 
|  | coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode); | 
|  |  | 
|  | static const char *strings[] = { | 
|  | // U+FFFD | 
|  | "a\xef\xbf\xbdz", | 
|  | // illegal byte sequences | 
|  | "a\x80z",  // trail byte | 
|  | "a\xc1\x81z",  // non-shortest form | 
|  | "a\xe0\x82\x83z",  // non-shortest form | 
|  | "a\xed\xa0\x80z",  // lead surrogate: would be U+D800 | 
|  | "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF | 
|  | "a\xf0\x8f\xbf\xbfz",  // non-shortest form | 
|  | "a\xf4\x90\x80\x80z"  // out of range: would be U+110000 | 
|  | }; | 
|  |  | 
|  | StringPiece fffd(strings[0]); | 
|  | for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) { | 
|  | StringPiece illegal(strings[i]); | 
|  | UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode); | 
|  | if(order != UCOL_EQUAL) { | 
|  | errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL", | 
|  | (int)i, order); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | namespace { | 
|  |  | 
|  | void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) { | 
|  | for(UChar32 c = 0x10000; c < 0x110000;) { | 
|  | UChar32 next = c + 0x400; | 
|  | if(src.containsSome(c, next - 1)) { | 
|  | dest.add(U16_LEAD(c)); | 
|  | } | 
|  | c = next; | 
|  | } | 
|  | } | 
|  |  | 
|  | }  // namespace | 
|  |  | 
|  | void CollationTest::TestShortFCDData() { | 
|  | // See CollationFCD class comments. | 
|  | IcuTestErrorCode errorCode(*this, "TestShortFCDData"); | 
|  | UnicodeSet expectedLccc("[:^lccc=0:]", errorCode); | 
|  | errorCode.assertSuccess(); | 
|  | expectedLccc.add(0xdc00, 0xdfff);  // add all trail surrogates | 
|  | addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc); | 
|  | UnicodeSet lccc;  // actual | 
|  | for(UChar32 c = 0; c <= 0xffff; ++c) { | 
|  | if(CollationFCD::hasLccc(c)) { lccc.add(c); } | 
|  | } | 
|  | UnicodeSet diff(expectedLccc); | 
|  | diff.removeAll(lccc); | 
|  | diff.remove(0x10000, 0x10ffff);  // hasLccc() only works for the BMP | 
|  | UnicodeString empty("[]"); | 
|  | UnicodeString diffString; | 
|  | diff.toPattern(diffString, TRUE); | 
|  | assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString); | 
|  | diff = lccc; | 
|  | diff.removeAll(expectedLccc); | 
|  | diff.toPattern(diffString, TRUE); | 
|  | assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE); | 
|  |  | 
|  | UnicodeSet expectedTccc("[:^tccc=0:]", errorCode); | 
|  | if (errorCode.isSuccess()) { | 
|  | addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc); | 
|  | addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc); | 
|  | UnicodeSet tccc;  // actual | 
|  | for(UChar32 c = 0; c <= 0xffff; ++c) { | 
|  | if(CollationFCD::hasTccc(c)) { tccc.add(c); } | 
|  | } | 
|  | diff = expectedTccc; | 
|  | diff.removeAll(tccc); | 
|  | diff.remove(0x10000, 0x10ffff);  // hasTccc() only works for the BMP | 
|  | assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString); | 
|  | diff = tccc; | 
|  | diff.removeAll(expectedTccc); | 
|  | diff.toPattern(diffString, TRUE); | 
|  | assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString); | 
|  | } | 
|  | } | 
|  |  | 
|  | class CodePointIterator { | 
|  | public: | 
|  | CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {} | 
|  | void resetToStart() { pos = 0; } | 
|  | UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; } | 
|  | UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; } | 
|  | int32_t getLength() const { return length; } | 
|  | int getIndex() const { return (int)pos; } | 
|  | private: | 
|  | const UChar32 *cp; | 
|  | int32_t length; | 
|  | int32_t pos; | 
|  | }; | 
|  |  | 
|  | void CollationTest::checkFCD(const char *name, | 
|  | CollationIterator &ci, CodePointIterator &cpi) { | 
|  | IcuTestErrorCode errorCode(*this, "checkFCD"); | 
|  |  | 
|  | // Iterate forward to the limit. | 
|  | for(;;) { | 
|  | UChar32 c1 = ci.nextCodePoint(errorCode); | 
|  | UChar32 c2 = cpi.next(); | 
|  | if(c1 != c2) { | 
|  | errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d", | 
|  | name, (long)c1, (long)c2, cpi.getIndex()); | 
|  | return; | 
|  | } | 
|  | if(c1 < 0) { break; } | 
|  | } | 
|  |  | 
|  | // Iterate backward most of the way. | 
|  | for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) { | 
|  | UChar32 c1 = ci.previousCodePoint(errorCode); | 
|  | UChar32 c2 = cpi.previous(); | 
|  | if(c1 != c2) { | 
|  | errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d", | 
|  | name, (long)c1, (long)c2, cpi.getIndex()); | 
|  | return; | 
|  | } | 
|  | } | 
|  |  | 
|  | // Forward again. | 
|  | for(;;) { | 
|  | UChar32 c1 = ci.nextCodePoint(errorCode); | 
|  | UChar32 c2 = cpi.next(); | 
|  | if(c1 != c2) { | 
|  | errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d", | 
|  | name, (long)c1, (long)c2, cpi.getIndex()); | 
|  | return; | 
|  | } | 
|  | if(c1 < 0) { break; } | 
|  | } | 
|  |  | 
|  | // Iterate backward to the start. | 
|  | for(;;) { | 
|  | UChar32 c1 = ci.previousCodePoint(errorCode); | 
|  | UChar32 c2 = cpi.previous(); | 
|  | if(c1 != c2) { | 
|  | errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d", | 
|  | name, (long)c1, (long)c2, cpi.getIndex()); | 
|  | return; | 
|  | } | 
|  | if(c1 < 0) { break; } | 
|  | } | 
|  | } | 
|  |  | 
|  | void CollationTest::TestFCD() { | 
|  | IcuTestErrorCode errorCode(*this, "TestFCD"); | 
|  | const CollationData *data = CollationRoot::getData(errorCode); | 
|  | if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) { | 
|  | return; | 
|  | } | 
|  |  | 
|  | // Input string, not FCD, NUL-terminated. | 
|  | static const UChar s[] = { | 
|  | 0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62, | 
|  | U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),  // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216 | 
|  | 0x327, 0x308,  // ccc=202, 230 | 
|  | U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),  // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226 | 
|  | U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), | 
|  | U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), | 
|  | 0xac01, | 
|  | 0xe7,  // Character with tccc!=0 decomposed together with mis-ordered sequence. | 
|  | U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165), | 
|  | 0xe1,  // Character with tccc!=0 decomposed together with decomposed sequence. | 
|  | 0xf73, 0xf75,  // Tibetan composite vowels must be decomposed. | 
|  | 0x4e00, 0xf81, | 
|  | 0 | 
|  | }; | 
|  | // Expected code points. | 
|  | static const UChar32 cp[] = { | 
|  | 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62, | 
|  | 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308, | 
|  | 0x1D15F, 0x1D16D, | 
|  | 0xac01, | 
|  | 0x63, 0x327, 0x1D165, 0x1D16D, | 
|  | 0x61, | 
|  | 0xf71, 0xf71, 0xf72, 0xf74, 0x301, | 
|  | 0x4e00, 0xf71, 0xf80 | 
|  | }; | 
|  |  | 
|  | FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL); | 
|  | if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) { | 
|  | return; | 
|  | } | 
|  | CodePointIterator cpi(cp, UPRV_LENGTHOF(cp)); | 
|  | checkFCD("FCDUTF16CollationIterator", u16ci, cpi); | 
|  |  | 
|  | #if U_HAVE_STD_STRING | 
|  | cpi.resetToStart(); | 
|  | std::string utf8; | 
|  | UnicodeString(s).toUTF8String(utf8); | 
|  | FCDUTF8CollationIterator u8ci(data, FALSE, | 
|  | reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1); | 
|  | if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) { | 
|  | return; | 
|  | } | 
|  | checkFCD("FCDUTF8CollationIterator", u8ci, cpi); | 
|  | #endif | 
|  |  | 
|  | cpi.resetToStart(); | 
|  | UCharIterator iter; | 
|  | uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1);  // -1: without the terminating NUL | 
|  | FCDUIterCollationIterator uici(data, FALSE, iter, 0); | 
|  | if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) { | 
|  | return; | 
|  | } | 
|  | checkFCD("FCDUIterCollationIterator", uici, cpi); | 
|  | } | 
|  |  | 
|  | void CollationTest::checkAllocWeights(CollationWeights &cw, | 
|  | uint32_t lowerLimit, uint32_t upperLimit, int32_t n, | 
|  | int32_t someLength, int32_t minCount) { | 
|  | if(!cw.allocWeights(lowerLimit, upperLimit, n)) { | 
|  | errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE", | 
|  | (long)lowerLimit, (long)upperLimit, (long)n); | 
|  | return; | 
|  | } | 
|  | uint32_t previous = lowerLimit; | 
|  | int32_t count = 0;  // number of weights that have someLength | 
|  | for(int32_t i = 0; i < n; ++i) { | 
|  | uint32_t w = cw.nextWeight(); | 
|  | if(w == 0xffffffff) { | 
|  | errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() " | 
|  | "returns only %ld weights", | 
|  | (long)lowerLimit, (long)upperLimit, (long)n, (long)i); | 
|  | return; | 
|  | } | 
|  | if(!(previous < w && w < upperLimit)) { | 
|  | errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() " | 
|  | "number %ld -> %lx not between %lx and %lx", | 
|  | (long)lowerLimit, (long)upperLimit, (long)n, | 
|  | (long)(i + 1), (long)w, (long)previous, (long)upperLimit); | 
|  | return; | 
|  | } | 
|  | if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; } | 
|  | } | 
|  | if(count < minCount) { | 
|  | errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() " | 
|  | "returns only %ld < %ld weights of length %d", | 
|  | (long)lowerLimit, (long)upperLimit, (long)n, | 
|  | (long)count, (long)minCount, (int)someLength); | 
|  | } | 
|  | } | 
|  |  | 
|  | void CollationTest::TestCollationWeights() { | 
|  | CollationWeights cw; | 
|  |  | 
|  | // Non-compressible primaries use 254 second bytes 02..FF. | 
|  | logln("CollationWeights.initForPrimary(non-compressible)"); | 
|  | cw.initForPrimary(FALSE); | 
|  | // Expect 1 weight 11 and 254 weights 12xx. | 
|  | checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1); | 
|  | checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254); | 
|  | // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202. | 
|  | checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255); | 
|  | // Expect 254 two-byte weights from the ranges 10ff and 11xx. | 
|  | checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254); | 
|  | // Expect 254^2=64516 three-byte weights. | 
|  | // During computation, there should be 3 three-byte ranges | 
|  | // 10ffff, 11xxxx, 120202. | 
|  | // The middle one should be split 64515:1, | 
|  | // and the newly-split-off range and the last ranged lengthened. | 
|  | checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516); | 
|  | // Expect weights 1102 & 1103. | 
|  | checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2); | 
|  | // Expect weights 102102 & 102103. | 
|  | checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2); | 
|  |  | 
|  | // Compressible primaries use 251 second bytes 04..FE. | 
|  | logln("CollationWeights.initForPrimary(compressible)"); | 
|  | cw.initForPrimary(TRUE); | 
|  | // Expect 1 weight 11 and 251 weights 12xx. | 
|  | checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1); | 
|  | checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251); | 
|  | // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204. | 
|  | checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252); | 
|  | // Expect weights 1104 & 1105. | 
|  | checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2); | 
|  | // Expect weights 102102 & 102103. | 
|  | checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2); | 
|  |  | 
|  | // Secondary and tertiary weights use only bytes 3 & 4. | 
|  | logln("CollationWeights.initForSecondary()"); | 
|  | cw.initForSecondary(); | 
|  | // Expect weights fbxx and all four fc..ff. | 
|  | checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4); | 
|  |  | 
|  | logln("CollationWeights.initForTertiary()"); | 
|  | cw.initForTertiary(); | 
|  | // Expect weights 3dxx and both 3e & 3f. | 
|  | checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2); | 
|  | } | 
|  |  | 
|  | namespace { | 
|  |  | 
|  | UBool isValidCE(const CollationRootElements &re, const CollationData &data, | 
|  | uint32_t p, uint32_t s, uint32_t ctq) { | 
|  | uint32_t p1 = p >> 24; | 
|  | uint32_t p2 = (p >> 16) & 0xff; | 
|  | uint32_t p3 = (p >> 8) & 0xff; | 
|  | uint32_t p4 = p & 0xff; | 
|  | uint32_t s1 = s >> 8; | 
|  | uint32_t s2 = s & 0xff; | 
|  | // ctq = Case, Tertiary, Quaternary | 
|  | uint32_t c = (ctq & Collation::CASE_MASK) >> 14; | 
|  | uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK; | 
|  | uint32_t t1 = t >> 8; | 
|  | uint32_t t2 = t & 0xff; | 
|  | uint32_t q = ctq & Collation::QUATERNARY_MASK; | 
|  | // No leading zero bytes. | 
|  | if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) { | 
|  | return FALSE; | 
|  | } | 
|  | // No intermediate zero bytes. | 
|  | if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) { | 
|  | return FALSE; | 
|  | } | 
|  | if(p2 != 0 && p3 == 0 && p4 != 0) { | 
|  | return FALSE; | 
|  | } | 
|  | // Minimum & maximum lead bytes. | 
|  | if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) || | 
|  | s1 == Collation::LEVEL_SEPARATOR_BYTE || | 
|  | t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) { | 
|  | return FALSE; | 
|  | } | 
|  | if(c > 2) { | 
|  | return FALSE; | 
|  | } | 
|  | // The valid byte range for the second primary byte depends on compressibility. | 
|  | if(p2 != 0) { | 
|  | if(data.isCompressibleLeadByte(p1)) { | 
|  | if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE || | 
|  | Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) { | 
|  | return FALSE; | 
|  | } | 
|  | } else { | 
|  | if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) { | 
|  | return FALSE; | 
|  | } | 
|  | } | 
|  | } | 
|  | // Other bytes just need to avoid the level separator. | 
|  | // Trailing zeros are ok. | 
|  | U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1); | 
|  | if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE || | 
|  | s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) { | 
|  | return FALSE; | 
|  | } | 
|  | // Well-formed CEs. | 
|  | if(p == 0) { | 
|  | if(s == 0) { | 
|  | if(t == 0) { | 
|  | // Completely ignorable CE. | 
|  | // Quaternary CEs are not supported. | 
|  | if(c != 0 || q != 0) { | 
|  | return FALSE; | 
|  | } | 
|  | } else { | 
|  | // Tertiary CE. | 
|  | if(t < re.getTertiaryBoundary() || c != 2) { | 
|  | return FALSE; | 
|  | } | 
|  | } | 
|  | } else { | 
|  | // Secondary CE. | 
|  | if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) { | 
|  | return FALSE; | 
|  | } | 
|  | } | 
|  | } else { | 
|  | // Primary CE. | 
|  | if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) || | 
|  | s >= re.getSecondaryBoundary()) { | 
|  | return FALSE; | 
|  | } | 
|  | if(t == 0 || t >= re.getTertiaryBoundary()) { | 
|  | return FALSE; | 
|  | } | 
|  | } | 
|  | return TRUE; | 
|  | } | 
|  |  | 
|  | UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) { | 
|  | uint32_t p = (uint32_t)(ce >> 32); | 
|  | uint32_t secTer = (uint32_t)ce; | 
|  | return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff); | 
|  | } | 
|  |  | 
|  | class RootElementsIterator { | 
|  | public: | 
|  | RootElementsIterator(const CollationData &root) | 
|  | : data(root), | 
|  | elements(root.rootElements), length(root.rootElementsLength), | 
|  | pri(0), secTer(0), | 
|  | index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {} | 
|  |  | 
|  | UBool next() { | 
|  | if(index >= length) { return FALSE; } | 
|  | uint32_t p = elements[index]; | 
|  | if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; } | 
|  | if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) { | 
|  | ++index; | 
|  | secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG; | 
|  | return TRUE; | 
|  | } | 
|  | if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) { | 
|  | // End of a range, enumerate the primaries in the range. | 
|  | int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK; | 
|  | p &= 0xffffff00; | 
|  | if(pri == p) { | 
|  | // Finished the range, return the next CE after it. | 
|  | ++index; | 
|  | return next(); | 
|  | } | 
|  | U_ASSERT(pri < p); | 
|  | // Return the next primary in this range. | 
|  | UBool isCompressible = data.isCompressiblePrimary(pri); | 
|  | if((pri & 0xffff) == 0) { | 
|  | pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step); | 
|  | } else { | 
|  | pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step); | 
|  | } | 
|  | return TRUE; | 
|  | } | 
|  | // Simple primary CE. | 
|  | ++index; | 
|  | pri = p; | 
|  | // Does this have an explicit below-common sec/ter unit, | 
|  | // or does it imply a common one? | 
|  | if(index == length) { | 
|  | secTer = Collation::COMMON_SEC_AND_TER_CE; | 
|  | } else { | 
|  | secTer = elements[index]; | 
|  | if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) { | 
|  | // No sec/ter delta. | 
|  | secTer = Collation::COMMON_SEC_AND_TER_CE; | 
|  | } else { | 
|  | secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG; | 
|  | if(secTer > Collation::COMMON_SEC_AND_TER_CE) { | 
|  | // Implied sec/ter. | 
|  | secTer = Collation::COMMON_SEC_AND_TER_CE; | 
|  | } else { | 
|  | // Explicit sec/ter below common/common. | 
|  | ++index; | 
|  | } | 
|  | } | 
|  | } | 
|  | return TRUE; | 
|  | } | 
|  |  | 
|  | uint32_t getPrimary() const { return pri; } | 
|  | uint32_t getSecTer() const { return secTer; } | 
|  |  | 
|  | private: | 
|  | const CollationData &data; | 
|  | const uint32_t *elements; | 
|  | int32_t length; | 
|  |  | 
|  | uint32_t pri; | 
|  | uint32_t secTer; | 
|  | int32_t index; | 
|  | }; | 
|  |  | 
|  | }  // namespace | 
|  |  | 
|  | void CollationTest::TestRootElements() { | 
|  | IcuTestErrorCode errorCode(*this, "TestRootElements"); | 
|  | const CollationData *root = CollationRoot::getData(errorCode); | 
|  | if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) { | 
|  | return; | 
|  | } | 
|  | CollationRootElements rootElements(root->rootElements, root->rootElementsLength); | 
|  | RootElementsIterator iter(*root); | 
|  |  | 
|  | // We check each root CE for validity, | 
|  | // and we also verify that there is a tailoring gap between each two CEs. | 
|  | CollationWeights cw1c;  // compressible primary weights | 
|  | CollationWeights cw1u;  // uncompressible primary weights | 
|  | CollationWeights cw2; | 
|  | CollationWeights cw3; | 
|  |  | 
|  | cw1c.initForPrimary(TRUE); | 
|  | cw1u.initForPrimary(FALSE); | 
|  | cw2.initForSecondary(); | 
|  | cw3.initForTertiary(); | 
|  |  | 
|  | // Note: The root elements do not include Han-implicit or unassigned-implicit CEs, | 
|  | // nor the special merge-separator CE for U+FFFE. | 
|  | uint32_t prevPri = 0; | 
|  | uint32_t prevSec = 0; | 
|  | uint32_t prevTer = 0; | 
|  | while(iter.next()) { | 
|  | uint32_t pri = iter.getPrimary(); | 
|  | uint32_t secTer = iter.getSecTer(); | 
|  | // CollationRootElements CEs must have 0 case and quaternary bits. | 
|  | if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) { | 
|  | errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx", | 
|  | (long)pri, (long)secTer); | 
|  | } | 
|  | uint32_t sec = secTer >> 16; | 
|  | uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK; | 
|  | uint32_t ctq = ter; | 
|  | if(pri == 0 && sec == 0 && ter != 0) { | 
|  | // Tertiary CEs must have uppercase bits, | 
|  | // but they are not stored in the CollationRootElements. | 
|  | ctq |= 0x8000; | 
|  | } | 
|  | if(!isValidCE(rootElements, *root, pri, sec, ctq)) { | 
|  | errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer); | 
|  | } else { | 
|  | if(pri != prevPri) { | 
|  | uint32_t newWeight = 0; | 
|  | if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) { | 
|  | // There is currently no tailoring gap after primary ignorables, | 
|  | // and we forbid tailoring after U+FFFD and U+FFFF. | 
|  | } else if(root->isCompressiblePrimary(prevPri)) { | 
|  | if(!cw1c.allocWeights(prevPri, pri, 1)) { | 
|  | errln("no primary/compressible tailoring gap between %08lx and %08lx", | 
|  | (long)prevPri, (long)pri); | 
|  | } else { | 
|  | newWeight = cw1c.nextWeight(); | 
|  | } | 
|  | } else { | 
|  | if(!cw1u.allocWeights(prevPri, pri, 1)) { | 
|  | errln("no primary/uncompressible tailoring gap between %08lx and %08lx", | 
|  | (long)prevPri, (long)pri); | 
|  | } else { | 
|  | newWeight = cw1u.nextWeight(); | 
|  | } | 
|  | } | 
|  | if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) { | 
|  | errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx", | 
|  | (long)prevPri, (long)newWeight, (long)pri); | 
|  | } | 
|  | } else if(sec != prevSec) { | 
|  | uint32_t lowerLimit = | 
|  | prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec; | 
|  | if(!cw2.allocWeights(lowerLimit, sec, 1)) { | 
|  | errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec); | 
|  | } else { | 
|  | uint32_t newWeight = cw2.nextWeight(); | 
|  | if(!(prevSec < newWeight && newWeight < sec)) { | 
|  | errln("mis-allocated secondary weight, should get %04x < %04x < %04x", | 
|  | (long)lowerLimit, (long)newWeight, (long)sec); | 
|  | } | 
|  | } | 
|  | } else if(ter != prevTer) { | 
|  | uint32_t lowerLimit = | 
|  | prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer; | 
|  | if(!cw3.allocWeights(lowerLimit, ter, 1)) { | 
|  | errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter); | 
|  | } else { | 
|  | uint32_t newWeight = cw3.nextWeight(); | 
|  | if(!(prevTer < newWeight && newWeight < ter)) { | 
|  | errln("mis-allocated secondary weight, should get %04x < %04x < %04x", | 
|  | (long)lowerLimit, (long)newWeight, (long)ter); | 
|  | } | 
|  | } | 
|  | } else { | 
|  | errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer); | 
|  | } | 
|  | } | 
|  | prevPri = pri; | 
|  | prevSec = sec; | 
|  | prevTer = ter; | 
|  | } | 
|  | } | 
|  |  | 
|  | void CollationTest::TestTailoredElements() { | 
|  | IcuTestErrorCode errorCode(*this, "TestTailoredElements"); | 
|  | const CollationData *root = CollationRoot::getData(errorCode); | 
|  | if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) { | 
|  | return; | 
|  | } | 
|  | CollationRootElements rootElements(root->rootElements, root->rootElementsLength); | 
|  |  | 
|  | UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode); | 
|  | if(errorCode.logIfFailureAndReset("failed to create a hash table")) { | 
|  | return; | 
|  | } | 
|  | uhash_setKeyDeleter(prevLocales, uprv_free); | 
|  | // TestRootElements() tests the root collator which does not have tailorings. | 
|  | uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode); | 
|  | uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode); | 
|  | uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode); | 
|  |  | 
|  | UVector64 ces(errorCode); | 
|  | LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales()); | 
|  | U_ASSERT(locales.isValid()); | 
|  | const char *localeID = "root"; | 
|  | do { | 
|  | Locale locale(localeID); | 
|  | LocalPointer<StringEnumeration> types( | 
|  | Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode)); | 
|  | errorCode.assertSuccess(); | 
|  | const char *type;  // first: default type | 
|  | while((type = types->next(NULL, errorCode)) != NULL) { | 
|  | if(strncmp(type, "private-", 8) == 0) { | 
|  | errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s", | 
|  | localeID, type); | 
|  | } | 
|  | Locale localeWithType(locale); | 
|  | localeWithType.setKeywordValue("collation", type, errorCode); | 
|  | errorCode.assertSuccess(); | 
|  | LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode)); | 
|  | if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)", | 
|  | localeWithType.getName())) { | 
|  | continue; | 
|  | } | 
|  | Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode); | 
|  | if(uhash_geti(prevLocales, actual.getName()) != 0) { | 
|  | continue; | 
|  | } | 
|  | uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode); | 
|  | errorCode.assertSuccess(); | 
|  | logln("TestTailoredElements(): requested %s -> actual %s", | 
|  | localeWithType.getName(), actual.getName()); | 
|  | RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias()); | 
|  | if(rbc == NULL) { | 
|  | continue; | 
|  | } | 
|  | // Note: It would be better to get tailored strings such that we can | 
|  | // identify the prefix, and only get the CEs for the prefix+string, | 
|  | // not also for the prefix. | 
|  | // There is currently no API for that. | 
|  | // It would help in an unusual case where a contraction starting in the prefix | 
|  | // extends past its end, and we do not see the intended mapping. | 
|  | // For example, for a mapping p|st, if there is also a contraction ps, | 
|  | // then we get CEs(ps)+CEs(t), rather than CEs(p|st). | 
|  | LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode)); | 
|  | errorCode.assertSuccess(); | 
|  | UnicodeSetIterator iter(*tailored); | 
|  | while(iter.next()) { | 
|  | const UnicodeString &s = iter.getString(); | 
|  | ces.removeAllElements(); | 
|  | rbc->internalGetCEs(s, ces, errorCode); | 
|  | errorCode.assertSuccess(); | 
|  | for(int32_t i = 0; i < ces.size(); ++i) { | 
|  | int64_t ce = ces.elementAti(i); | 
|  | if(!isValidCE(rootElements, *root, ce)) { | 
|  | errln("invalid tailored CE %016llx at CE index %d from string:", | 
|  | (long long)ce, (int)i); | 
|  | infoln(prettify(s)); | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  | } while((localeID = locales->next(NULL, errorCode)) != NULL); | 
|  | uhash_close(prevLocales); | 
|  | } | 
|  |  | 
|  | UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) { | 
|  | UnicodeString s; | 
|  | for(int32_t i = 0; i < length; ++i) { | 
|  | if(i > 0) { s.append((UChar)0x20); } | 
|  | uint8_t b = p[i]; | 
|  | if(b == 0) { | 
|  | s.append((UChar)0x2e);  // period | 
|  | } else if(b == 1) { | 
|  | s.append((UChar)0x7c);  // vertical bar | 
|  | } else { | 
|  | appendHex(b, 2, s); | 
|  | } | 
|  | } | 
|  | return s; | 
|  | } | 
|  |  | 
|  | UnicodeString CollationTest::printCollationKey(const CollationKey &key) { | 
|  | int32_t length; | 
|  | const uint8_t *p = key.getByteArray(length); | 
|  | return printSortKey(p, length); | 
|  | } | 
|  |  | 
|  | UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) { | 
|  | for(;;) { | 
|  | int32_t lineLength; | 
|  | const UChar *line = ucbuf_readline(f, &lineLength, errorCode); | 
|  | if(line == NULL || errorCode.isFailure()) { | 
|  | fileLine.remove(); | 
|  | return FALSE; | 
|  | } | 
|  | ++fileLineNumber; | 
|  | // Strip trailing CR/LF, comments, and spaces. | 
|  | const UChar *comment = u_memchr(line, 0x23, lineLength);  // '#' | 
|  | if(comment != NULL) { | 
|  | lineLength = (int32_t)(comment - line); | 
|  | } else { | 
|  | while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; } | 
|  | } | 
|  | while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; } | 
|  | if(lineLength != 0) { | 
|  | fileLine.setTo(FALSE, line, lineLength); | 
|  | return TRUE; | 
|  | } | 
|  | // Empty line, continue. | 
|  | } | 
|  | } | 
|  |  | 
|  | void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, | 
|  | UErrorCode &errorCode) { | 
|  | int32_t length = fileLine.length(); | 
|  | int32_t i; | 
|  | for(i = start; i < length && !isSpace(fileLine[i]); ++i) {} | 
|  | int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start);  // '|' | 
|  | if(pipeIndex >= 0) { | 
|  | prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape(); | 
|  | if(prefix.isEmpty()) { | 
|  | errln("empty prefix on line %d", (int)fileLineNumber); | 
|  | infoln(fileLine); | 
|  | errorCode = U_PARSE_ERROR; | 
|  | return; | 
|  | } | 
|  | start = pipeIndex + 1; | 
|  | } else { | 
|  | prefix.remove(); | 
|  | } | 
|  | s = fileLine.tempSubStringBetween(start, i).unescape(); | 
|  | if(s.isEmpty()) { | 
|  | errln("empty string on line %d", (int)fileLineNumber); | 
|  | infoln(fileLine); | 
|  | errorCode = U_PARSE_ERROR; | 
|  | return; | 
|  | } | 
|  | start = i; | 
|  | } | 
|  |  | 
|  | Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) { | 
|  | Collation::Level relation; | 
|  | int32_t start; | 
|  | if(fileLine[0] == 0x3c) {  // < | 
|  | UChar second = fileLine[1]; | 
|  | start = 2; | 
|  | switch(second) { | 
|  | case 0x31:  // <1 | 
|  | relation = Collation::PRIMARY_LEVEL; | 
|  | break; | 
|  | case 0x32:  // <2 | 
|  | relation = Collation::SECONDARY_LEVEL; | 
|  | break; | 
|  | case 0x33:  // <3 | 
|  | relation = Collation::TERTIARY_LEVEL; | 
|  | break; | 
|  | case 0x34:  // <4 | 
|  | relation = Collation::QUATERNARY_LEVEL; | 
|  | break; | 
|  | case 0x63:  // <c | 
|  | relation = Collation::CASE_LEVEL; | 
|  | break; | 
|  | case 0x69:  // <i | 
|  | relation = Collation::IDENTICAL_LEVEL; | 
|  | break; | 
|  | default:  // just < | 
|  | relation = Collation::NO_LEVEL; | 
|  | start = 1; | 
|  | break; | 
|  | } | 
|  | } else if(fileLine[0] == 0x3d) {  // = | 
|  | relation = Collation::ZERO_LEVEL; | 
|  | start = 1; | 
|  | } else { | 
|  | start = 0; | 
|  | } | 
|  | if(start == 0 || !isSpace(fileLine[start])) { | 
|  | errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber); | 
|  | infoln(fileLine); | 
|  | errorCode.set(U_PARSE_ERROR); | 
|  | return Collation::NO_LEVEL; | 
|  | } | 
|  | start = skipSpaces(start); | 
|  | UnicodeString prefix; | 
|  | parseString(start, prefix, s, errorCode); | 
|  | if(errorCode.isSuccess() && !prefix.isEmpty()) { | 
|  | errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber); | 
|  | infoln(fileLine); | 
|  | errorCode.set(U_PARSE_ERROR); | 
|  | return Collation::NO_LEVEL; | 
|  | } | 
|  | if(start < fileLine.length()) { | 
|  | errln("unexpected line contents after test string on line %d", (int)fileLineNumber); | 
|  | infoln(fileLine); | 
|  | errorCode.set(U_PARSE_ERROR); | 
|  | return Collation::NO_LEVEL; | 
|  | } | 
|  | return relation; | 
|  | } | 
|  |  | 
|  | static const struct { | 
|  | const char *name; | 
|  | UColAttribute attr; | 
|  | } attributes[] = { | 
|  | { "backwards", UCOL_FRENCH_COLLATION }, | 
|  | { "alternate", UCOL_ALTERNATE_HANDLING }, | 
|  | { "caseFirst", UCOL_CASE_FIRST }, | 
|  | { "caseLevel", UCOL_CASE_LEVEL }, | 
|  | // UCOL_NORMALIZATION_MODE is turned on and off automatically. | 
|  | { "strength", UCOL_STRENGTH }, | 
|  | // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated. | 
|  | { "numeric", UCOL_NUMERIC_COLLATION } | 
|  | }; | 
|  |  | 
|  | static const struct { | 
|  | const char *name; | 
|  | UColAttributeValue value; | 
|  | } attributeValues[] = { | 
|  | { "default", UCOL_DEFAULT }, | 
|  | { "primary", UCOL_PRIMARY }, | 
|  | { "secondary", UCOL_SECONDARY }, | 
|  | { "tertiary", UCOL_TERTIARY }, | 
|  | { "quaternary", UCOL_QUATERNARY }, | 
|  | { "identical", UCOL_IDENTICAL }, | 
|  | { "off", UCOL_OFF }, | 
|  | { "on", UCOL_ON }, | 
|  | { "shifted", UCOL_SHIFTED }, | 
|  | { "non-ignorable", UCOL_NON_IGNORABLE }, | 
|  | { "lower", UCOL_LOWER_FIRST }, | 
|  | { "upper", UCOL_UPPER_FIRST } | 
|  | }; | 
|  |  | 
|  | void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) { | 
|  | // Parse attributes even if the Collator could not be created, | 
|  | // in order to report syntax errors. | 
|  | int32_t start = skipSpaces(1); | 
|  | int32_t equalPos = fileLine.indexOf(0x3d); | 
|  | if(equalPos < 0) { | 
|  | if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) { | 
|  | parseAndSetReorderCodes(start + 7, errorCode); | 
|  | return; | 
|  | } | 
|  | errln("missing '=' on line %d", (int)fileLineNumber); | 
|  | infoln(fileLine); | 
|  | errorCode.set(U_PARSE_ERROR); | 
|  | return; | 
|  | } | 
|  |  | 
|  | UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos); | 
|  | UnicodeString valueString = fileLine.tempSubString(equalPos+1); | 
|  | if(attrString == UNICODE_STRING("maxVariable", 11)) { | 
|  | UColReorderCode max; | 
|  | if(valueString == UNICODE_STRING("space", 5)) { | 
|  | max = UCOL_REORDER_CODE_SPACE; | 
|  | } else if(valueString == UNICODE_STRING("punct", 5)) { | 
|  | max = UCOL_REORDER_CODE_PUNCTUATION; | 
|  | } else if(valueString == UNICODE_STRING("symbol", 6)) { | 
|  | max = UCOL_REORDER_CODE_SYMBOL; | 
|  | } else if(valueString == UNICODE_STRING("currency", 8)) { | 
|  | max = UCOL_REORDER_CODE_CURRENCY; | 
|  | } else { | 
|  | errln("invalid attribute value name on line %d", (int)fileLineNumber); | 
|  | infoln(fileLine); | 
|  | errorCode.set(U_PARSE_ERROR); | 
|  | return; | 
|  | } | 
|  | if(coll != NULL) { | 
|  | coll->setMaxVariable(max, errorCode); | 
|  | if(errorCode.isFailure()) { | 
|  | errln("setMaxVariable() failed on line %d: %s", | 
|  | (int)fileLineNumber, errorCode.errorName()); | 
|  | infoln(fileLine); | 
|  | return; | 
|  | } | 
|  | } | 
|  | fileLine.remove(); | 
|  | return; | 
|  | } | 
|  |  | 
|  | UColAttribute attr; | 
|  | for(int32_t i = 0;; ++i) { | 
|  | if(i == UPRV_LENGTHOF(attributes)) { | 
|  | errln("invalid attribute name on line %d", (int)fileLineNumber); | 
|  | infoln(fileLine); | 
|  | errorCode.set(U_PARSE_ERROR); | 
|  | return; | 
|  | } | 
|  | if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) { | 
|  | attr = attributes[i].attr; | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | UColAttributeValue value; | 
|  | for(int32_t i = 0;; ++i) { | 
|  | if(i == UPRV_LENGTHOF(attributeValues)) { | 
|  | errln("invalid attribute value name on line %d", (int)fileLineNumber); | 
|  | infoln(fileLine); | 
|  | errorCode.set(U_PARSE_ERROR); | 
|  | return; | 
|  | } | 
|  | if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) { | 
|  | value = attributeValues[i].value; | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | if(coll != NULL) { | 
|  | coll->setAttribute(attr, value, errorCode); | 
|  | if(errorCode.isFailure()) { | 
|  | errln("illegal attribute=value combination on line %d: %s", | 
|  | (int)fileLineNumber, errorCode.errorName()); | 
|  | infoln(fileLine); | 
|  | return; | 
|  | } | 
|  | } | 
|  | fileLine.remove(); | 
|  | } | 
|  |  | 
|  | void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) { | 
|  | UVector32 reorderCodes(errorCode); | 
|  | while(start < fileLine.length()) { | 
|  | start = skipSpaces(start); | 
|  | int32_t limit = start; | 
|  | while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; } | 
|  | CharString name; | 
|  | name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode); | 
|  | int32_t code = CollationRuleParser::getReorderCode(name.data()); | 
|  | if(code < 0) { | 
|  | if(uprv_stricmp(name.data(), "default") == 0) { | 
|  | code = UCOL_REORDER_CODE_DEFAULT;  // -1 | 
|  | } else { | 
|  | errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber); | 
|  | infoln(fileLine); | 
|  | errorCode.set(U_PARSE_ERROR); | 
|  | return; | 
|  | } | 
|  | } | 
|  | reorderCodes.addElement(code, errorCode); | 
|  | start = limit; | 
|  | } | 
|  | if(coll != NULL) { | 
|  | coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode); | 
|  | if(errorCode.isFailure()) { | 
|  | errln("setReorderCodes() failed on line %d: %s", | 
|  | (int)fileLineNumber, errorCode.errorName()); | 
|  | infoln(fileLine); | 
|  | return; | 
|  | } | 
|  | } | 
|  | fileLine.remove(); | 
|  | } | 
|  |  | 
|  | void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) { | 
|  | UnicodeString rules; | 
|  | while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) { | 
|  | rules.append(fileLine.unescape()); | 
|  | } | 
|  | if(errorCode.isFailure()) { return; } | 
|  | logln(rules); | 
|  |  | 
|  | UParseError parseError; | 
|  | UnicodeString reason; | 
|  | delete coll; | 
|  | coll = new RuleBasedCollator(rules, parseError, reason, errorCode); | 
|  | if(coll == NULL) { | 
|  | errln("unable to allocate a new collator"); | 
|  | errorCode.set(U_MEMORY_ALLOCATION_ERROR); | 
|  | return; | 
|  | } | 
|  | if(errorCode.isFailure()) { | 
|  | dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName()); | 
|  | infoln(UnicodeString("  reason: ") + reason); | 
|  | if(parseError.offset >= 0) { infoln("  rules offset: %d", (int)parseError.offset); } | 
|  | if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) { | 
|  | infoln(UnicodeString("  snippet: ...") + | 
|  | parseError.preContext + "(!)" + parseError.postContext + "..."); | 
|  | } | 
|  | delete coll; | 
|  | coll = NULL; | 
|  | errorCode.reset(); | 
|  | } else { | 
|  | assertEquals("no error reason when RuleBasedCollator(rules) succeeds", | 
|  | UnicodeString(), reason); | 
|  | } | 
|  | } | 
|  |  | 
|  | void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) { | 
|  | if(errorCode.isFailure()) { return; } | 
|  | delete coll; | 
|  | coll = Collator::createInstance(Locale::getRoot(), errorCode); | 
|  | if(errorCode.isFailure()) { | 
|  | dataerrln("unable to create a root collator"); | 
|  | return; | 
|  | } | 
|  | } | 
|  |  | 
|  | void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) { | 
|  | if(errorCode.isFailure()) { return; } | 
|  | delete coll; | 
|  | coll = NULL; | 
|  | int32_t at = fileLine.indexOf((UChar)0x40, 9);  // @ is not invariant | 
|  | if(at >= 0) { | 
|  | fileLine.setCharAt(at, (UChar)0x2a);  // * | 
|  | } | 
|  | CharString localeID; | 
|  | localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode); | 
|  | if(at >= 0) { | 
|  | localeID.data()[at - 9] = '@'; | 
|  | } | 
|  | Locale locale(localeID.data()); | 
|  | if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) { | 
|  | errln("invalid language tag on line %d", (int)fileLineNumber); | 
|  | infoln(fileLine); | 
|  | if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); } | 
|  | return; | 
|  | } | 
|  |  | 
|  | logln("creating a collator for locale ID %s", locale.getName()); | 
|  | coll = Collator::createInstance(locale, errorCode); | 
|  | if(errorCode.isFailure()) { | 
|  | dataerrln("unable to create a collator for locale %s on line %d", | 
|  | locale.getName(), (int)fileLineNumber); | 
|  | infoln(fileLine); | 
|  | delete coll; | 
|  | coll = NULL; | 
|  | errorCode.reset(); | 
|  | } | 
|  | } | 
|  |  | 
|  | UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const { | 
|  | if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; } | 
|  | // In some sequences with Tibetan composite vowel signs, | 
|  | // even if the string passes the FCD check, | 
|  | // those composites must be decomposed. | 
|  | // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81. | 
|  | int32_t index = 0; | 
|  | while((index = s.indexOf((UChar)0xf71, index)) >= 0) { | 
|  | if(++index < s.length()) { | 
|  | UChar c = s[index]; | 
|  | if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; } | 
|  | } | 
|  | } | 
|  | return FALSE; | 
|  | } | 
|  |  | 
|  | UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length, | 
|  | CharString &dest, int32_t partSize, | 
|  | IcuTestErrorCode &errorCode) { | 
|  | if(errorCode.isFailure()) { return FALSE; } | 
|  | uint8_t part[32]; | 
|  | U_ASSERT(partSize <= UPRV_LENGTHOF(part)); | 
|  | UCharIterator iter; | 
|  | uiter_setString(&iter, s, length); | 
|  | uint32_t state[2] = { 0, 0 }; | 
|  | for(;;) { | 
|  | int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode); | 
|  | UBool done = partLength < partSize; | 
|  | if(done) { | 
|  | // At the end, append the next byte as well which should be 00. | 
|  | ++partLength; | 
|  | } | 
|  | dest.append(reinterpret_cast<char *>(part), partLength, errorCode); | 
|  | if(done) { | 
|  | return errorCode.isSuccess(); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line, | 
|  | const UChar *s, int32_t length, | 
|  | CollationKey &key, IcuTestErrorCode &errorCode) { | 
|  | if(errorCode.isFailure()) { return FALSE; } | 
|  | coll->getCollationKey(s, length, key, errorCode); | 
|  | if(errorCode.isFailure()) { | 
|  | infoln(fileTestName); | 
|  | errln("Collator(%s).getCollationKey() failed: %s", | 
|  | norm, errorCode.errorName()); | 
|  | infoln(line); | 
|  | return FALSE; | 
|  | } | 
|  | int32_t keyLength; | 
|  | const uint8_t *keyBytes = key.getByteArray(keyLength); | 
|  | if(keyLength == 0 || keyBytes[keyLength - 1] != 0) { | 
|  | infoln(fileTestName); | 
|  | errln("Collator(%s).getCollationKey() wrote an empty or unterminated key", | 
|  | norm); | 
|  | infoln(line); | 
|  | infoln(printCollationKey(key)); | 
|  | return FALSE; | 
|  | } | 
|  |  | 
|  | int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode); | 
|  | if(numLevels < UCOL_IDENTICAL) { | 
|  | ++numLevels; | 
|  | } else { | 
|  | numLevels = 5; | 
|  | } | 
|  | if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) { | 
|  | ++numLevels; | 
|  | } | 
|  | errorCode.assertSuccess(); | 
|  | int32_t numLevelSeparators = 0; | 
|  | for(int32_t i = 0; i < (keyLength - 1); ++i) { | 
|  | uint8_t b = keyBytes[i]; | 
|  | if(b == 0) { | 
|  | infoln(fileTestName); | 
|  | errln("Collator(%s).getCollationKey() contains a 00 byte", norm); | 
|  | infoln(line); | 
|  | infoln(printCollationKey(key)); | 
|  | return FALSE; | 
|  | } | 
|  | if(b == 1) { ++numLevelSeparators; } | 
|  | } | 
|  | if(numLevelSeparators != (numLevels - 1)) { | 
|  | infoln(fileTestName); | 
|  | errln("Collator(%s).getCollationKey() has %d level separators for %d levels", | 
|  | norm, (int)numLevelSeparators, (int)numLevels); | 
|  | infoln(line); | 
|  | infoln(printCollationKey(key)); | 
|  | return FALSE; | 
|  | } | 
|  |  | 
|  | // Check that internalNextSortKeyPart() makes the same key, with several part sizes. | 
|  | static const int32_t partSizes[] = { 32, 3, 1 }; | 
|  | for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) { | 
|  | int32_t partSize = partSizes[psi]; | 
|  | CharString parts; | 
|  | if(!getSortKeyParts(s, length, parts, 32, errorCode)) { | 
|  | infoln(fileTestName); | 
|  | errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s", | 
|  | norm, (int)partSize, errorCode.errorName()); | 
|  | infoln(line); | 
|  | return FALSE; | 
|  | } | 
|  | if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) { | 
|  | infoln(fileTestName); | 
|  | errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)", | 
|  | norm, (int)partSize); | 
|  | infoln(line); | 
|  | infoln(printCollationKey(key)); | 
|  | infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length())); | 
|  | return FALSE; | 
|  | } | 
|  | } | 
|  | return TRUE; | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Changes the key to the merged segments of the U+FFFE-separated substrings of s. | 
|  | * Leaves key unchanged if s does not contain U+FFFE. | 
|  | * @return TRUE if the key was successfully changed | 
|  | */ | 
|  | UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length, | 
|  | CollationKey &key, IcuTestErrorCode &errorCode) { | 
|  | if(errorCode.isFailure()) { return FALSE; } | 
|  | LocalMemory<uint8_t> mergedKey; | 
|  | int32_t mergedKeyLength = 0; | 
|  | int32_t mergedKeyCapacity = 0; | 
|  | int32_t sLength = (length >= 0) ? length : u_strlen(s); | 
|  | int32_t segmentStart = 0; | 
|  | for(int32_t i = 0;;) { | 
|  | if(i == sLength) { | 
|  | if(segmentStart == 0) { | 
|  | // s does not contain any U+FFFE. | 
|  | return FALSE; | 
|  | } | 
|  | } else if(s[i] != 0xfffe) { | 
|  | ++i; | 
|  | continue; | 
|  | } | 
|  | // Get the sort key for another segment and merge it into mergedKey. | 
|  | CollationKey key1(mergedKey.getAlias(), mergedKeyLength);  // copies the bytes | 
|  | CollationKey key2; | 
|  | coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode); | 
|  | int32_t key1Length, key2Length; | 
|  | const uint8_t *key1Bytes = key1.getByteArray(key1Length); | 
|  | const uint8_t *key2Bytes = key2.getByteArray(key2Length); | 
|  | uint8_t *dest; | 
|  | int32_t minCapacity = key1Length + key2Length; | 
|  | if(key1Length > 0) { --minCapacity; } | 
|  | if(minCapacity <= mergedKeyCapacity) { | 
|  | dest = mergedKey.getAlias(); | 
|  | } else { | 
|  | if(minCapacity <= 200) { | 
|  | mergedKeyCapacity = 200; | 
|  | } else if(minCapacity <= 2 * mergedKeyCapacity) { | 
|  | mergedKeyCapacity *= 2; | 
|  | } else { | 
|  | mergedKeyCapacity = minCapacity; | 
|  | } | 
|  | dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity); | 
|  | } | 
|  | U_ASSERT(dest != NULL || mergedKeyCapacity == 0); | 
|  | if(key1Length == 0) { | 
|  | // key2 is the sort key for the first segment. | 
|  | uprv_memcpy(dest, key2Bytes, key2Length); | 
|  | mergedKeyLength = key2Length; | 
|  | } else { | 
|  | mergedKeyLength = | 
|  | ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length, | 
|  | dest, mergedKeyCapacity); | 
|  | } | 
|  | if(i == sLength) { break; } | 
|  | segmentStart = ++i; | 
|  | } | 
|  | key = CollationKey(mergedKey.getAlias(), mergedKeyLength); | 
|  | return TRUE; | 
|  | } | 
|  |  | 
|  | namespace { | 
|  |  | 
|  | /** | 
|  | * Replaces unpaired surrogates with U+FFFD. | 
|  | * Returns s if no replacement was made, otherwise buffer. | 
|  | */ | 
|  | const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) { | 
|  | int32_t i = 0; | 
|  | while(i < s.length()) { | 
|  | UChar32 c = s.char32At(i); | 
|  | if(U_IS_SURROGATE(c)) { | 
|  | if(buffer.length() < i) { | 
|  | buffer.append(s, buffer.length(), i - buffer.length()); | 
|  | } | 
|  | buffer.append((UChar)0xfffd); | 
|  | } | 
|  | i += U16_LENGTH(c); | 
|  | } | 
|  | if(buffer.isEmpty()) { | 
|  | return s; | 
|  | } | 
|  | if(buffer.length() < i) { | 
|  | buffer.append(s, buffer.length(), i - buffer.length()); | 
|  | } | 
|  | return buffer; | 
|  | } | 
|  |  | 
|  | int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key, | 
|  | UCollationResult order, UBool collHasCaseLevel) { | 
|  | if(order == UCOL_EQUAL) { | 
|  | return Collation::NO_LEVEL; | 
|  | } | 
|  | int32_t prevKeyLength; | 
|  | const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength); | 
|  | int32_t keyLength; | 
|  | const uint8_t *bytes = key.getByteArray(keyLength); | 
|  | int32_t level = Collation::PRIMARY_LEVEL; | 
|  | for(int32_t i = 0;; ++i) { | 
|  | uint8_t b = prevBytes[i]; | 
|  | if(b != bytes[i]) { break; } | 
|  | if(b == Collation::LEVEL_SEPARATOR_BYTE) { | 
|  | ++level; | 
|  | if(level == Collation::CASE_LEVEL && !collHasCaseLevel) { | 
|  | ++level; | 
|  | } | 
|  | } | 
|  | } | 
|  | return level; | 
|  | } | 
|  |  | 
|  | } | 
|  |  | 
|  | UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine, | 
|  | const UnicodeString &prevString, const UnicodeString &s, | 
|  | UCollationResult expectedOrder, Collation::Level expectedLevel, | 
|  | IcuTestErrorCode &errorCode) { | 
|  | if(errorCode.isFailure()) { return FALSE; } | 
|  |  | 
|  | // Get the sort keys first, for error debug output. | 
|  | CollationKey prevKey; | 
|  | if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(), | 
|  | prevKey, errorCode)) { | 
|  | return FALSE; | 
|  | } | 
|  | CollationKey key; | 
|  | if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; } | 
|  |  | 
|  | UCollationResult order = coll->compare(prevString, s, errorCode); | 
|  | if(order != expectedOrder || errorCode.isFailure()) { | 
|  | infoln(fileTestName); | 
|  | errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)", | 
|  | (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName()); | 
|  | infoln(prevFileLine); | 
|  | infoln(fileLine); | 
|  | infoln(printCollationKey(prevKey)); | 
|  | infoln(printCollationKey(key)); | 
|  | return FALSE; | 
|  | } | 
|  | order = coll->compare(s, prevString, errorCode); | 
|  | if(order != -expectedOrder || errorCode.isFailure()) { | 
|  | infoln(fileTestName); | 
|  | errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)", | 
|  | (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName()); | 
|  | infoln(prevFileLine); | 
|  | infoln(fileLine); | 
|  | infoln(printCollationKey(prevKey)); | 
|  | infoln(printCollationKey(key)); | 
|  | return FALSE; | 
|  | } | 
|  | // Test NUL-termination if the strings do not contain NUL characters. | 
|  | UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0; | 
|  | if(!containNUL) { | 
|  | order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode); | 
|  | if(order != expectedOrder || errorCode.isFailure()) { | 
|  | infoln(fileTestName); | 
|  | errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)", | 
|  | (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName()); | 
|  | infoln(prevFileLine); | 
|  | infoln(fileLine); | 
|  | infoln(printCollationKey(prevKey)); | 
|  | infoln(printCollationKey(key)); | 
|  | return FALSE; | 
|  | } | 
|  | order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode); | 
|  | if(order != -expectedOrder || errorCode.isFailure()) { | 
|  | infoln(fileTestName); | 
|  | errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)", | 
|  | (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName()); | 
|  | infoln(prevFileLine); | 
|  | infoln(fileLine); | 
|  | infoln(printCollationKey(prevKey)); | 
|  | infoln(printCollationKey(key)); | 
|  | return FALSE; | 
|  | } | 
|  | } | 
|  |  | 
|  | #if U_HAVE_STD_STRING | 
|  | // compare(UTF-16) treats unpaired surrogates like unassigned code points. | 
|  | // Unpaired surrogates cannot be converted to UTF-8. | 
|  | // Create valid UTF-16 strings if necessary, and use those for | 
|  | // both the expected compare() result and for the input to compare(UTF-8). | 
|  | UnicodeString prevBuffer, sBuffer; | 
|  | const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer); | 
|  | const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer); | 
|  | std::string prevUTF8, sUTF8; | 
|  | UnicodeString(prevValid).toUTF8String(prevUTF8); | 
|  | UnicodeString(sValid).toUTF8String(sUTF8); | 
|  | UCollationResult expectedUTF8Order; | 
|  | if(&prevValid == &prevString && &sValid == &s) { | 
|  | expectedUTF8Order = expectedOrder; | 
|  | } else { | 
|  | expectedUTF8Order = coll->compare(prevValid, sValid, errorCode); | 
|  | } | 
|  |  | 
|  | order = coll->compareUTF8(prevUTF8, sUTF8, errorCode); | 
|  | if(order != expectedUTF8Order || errorCode.isFailure()) { | 
|  | infoln(fileTestName); | 
|  | errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)", | 
|  | (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName()); | 
|  | infoln(prevFileLine); | 
|  | infoln(fileLine); | 
|  | infoln(printCollationKey(prevKey)); | 
|  | infoln(printCollationKey(key)); | 
|  | return FALSE; | 
|  | } | 
|  | order = coll->compareUTF8(sUTF8, prevUTF8, errorCode); | 
|  | if(order != -expectedUTF8Order || errorCode.isFailure()) { | 
|  | infoln(fileTestName); | 
|  | errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)", | 
|  | (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName()); | 
|  | infoln(prevFileLine); | 
|  | infoln(fileLine); | 
|  | infoln(printCollationKey(prevKey)); | 
|  | infoln(printCollationKey(key)); | 
|  | return FALSE; | 
|  | } | 
|  | // Test NUL-termination if the strings do not contain NUL characters. | 
|  | if(!containNUL) { | 
|  | order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode); | 
|  | if(order != expectedUTF8Order || errorCode.isFailure()) { | 
|  | infoln(fileTestName); | 
|  | errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)", | 
|  | (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName()); | 
|  | infoln(prevFileLine); | 
|  | infoln(fileLine); | 
|  | infoln(printCollationKey(prevKey)); | 
|  | infoln(printCollationKey(key)); | 
|  | return FALSE; | 
|  | } | 
|  | order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode); | 
|  | if(order != -expectedUTF8Order || errorCode.isFailure()) { | 
|  | infoln(fileTestName); | 
|  | errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)", | 
|  | (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName()); | 
|  | infoln(prevFileLine); | 
|  | infoln(fileLine); | 
|  | infoln(printCollationKey(prevKey)); | 
|  | infoln(printCollationKey(key)); | 
|  | return FALSE; | 
|  | } | 
|  | } | 
|  | #endif | 
|  |  | 
|  | UCharIterator leftIter; | 
|  | UCharIterator rightIter; | 
|  | uiter_setString(&leftIter, prevString.getBuffer(), prevString.length()); | 
|  | uiter_setString(&rightIter, s.getBuffer(), s.length()); | 
|  | order = coll->compare(leftIter, rightIter, errorCode); | 
|  | if(order != expectedOrder || errorCode.isFailure()) { | 
|  | infoln(fileTestName); | 
|  | errln("line %d Collator(%s).compare(UCharIterator: previous, current) " | 
|  | "wrong order: %d != %d (%s)", | 
|  | (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName()); | 
|  | infoln(prevFileLine); | 
|  | infoln(fileLine); | 
|  | infoln(printCollationKey(prevKey)); | 
|  | infoln(printCollationKey(key)); | 
|  | return FALSE; | 
|  | } | 
|  |  | 
|  | order = prevKey.compareTo(key, errorCode); | 
|  | if(order != expectedOrder || errorCode.isFailure()) { | 
|  | infoln(fileTestName); | 
|  | errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)", | 
|  | (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName()); | 
|  | infoln(prevFileLine); | 
|  | infoln(fileLine); | 
|  | infoln(printCollationKey(prevKey)); | 
|  | infoln(printCollationKey(key)); | 
|  | return FALSE; | 
|  | } | 
|  | UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON; | 
|  | int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel); | 
|  | if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) { | 
|  | if(level != expectedLevel) { | 
|  | infoln(fileTestName); | 
|  | errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d", | 
|  | (int)fileLineNumber, norm, order, level, expectedLevel); | 
|  | infoln(prevFileLine); | 
|  | infoln(fileLine); | 
|  | infoln(printCollationKey(prevKey)); | 
|  | infoln(printCollationKey(key)); | 
|  | return FALSE; | 
|  | } | 
|  | } | 
|  |  | 
|  | // If either string contains U+FFFE, then their sort keys must compare the same as | 
|  | // the merged sort keys of each string's between-FFFE segments. | 
|  | // | 
|  | // It is not required that | 
|  | //   sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2)) | 
|  | // only that those two methods yield the same order. | 
|  | // | 
|  | // Use bit-wise OR so that getMergedCollationKey() is always called for both strings. | 
|  | if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) | | 
|  | getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) || | 
|  | errorCode.isFailure()) { | 
|  | order = prevKey.compareTo(key, errorCode); | 
|  | if(order != expectedOrder || errorCode.isFailure()) { | 
|  | infoln(fileTestName); | 
|  | errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey" | 
|  | "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)", | 
|  | (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName()); | 
|  | infoln(prevFileLine); | 
|  | infoln(fileLine); | 
|  | infoln(printCollationKey(prevKey)); | 
|  | infoln(printCollationKey(key)); | 
|  | return FALSE; | 
|  | } | 
|  | int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel); | 
|  | if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) { | 
|  | if(mergedLevel != level) { | 
|  | infoln(fileTestName); | 
|  | errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey" | 
|  | "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d", | 
|  | (int)fileLineNumber, norm, order, mergedLevel, level); | 
|  | infoln(prevFileLine); | 
|  | infoln(fileLine); | 
|  | infoln(printCollationKey(prevKey)); | 
|  | infoln(printCollationKey(key)); | 
|  | return FALSE; | 
|  | } | 
|  | } | 
|  | } | 
|  | return TRUE; | 
|  | } | 
|  |  | 
|  | void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) { | 
|  | if(errorCode.isFailure()) { return; } | 
|  | UnicodeString prevFileLine = UNICODE_STRING("(none)", 6); | 
|  | UnicodeString prevString, s; | 
|  | prevString.getTerminatedBuffer();  // Ensure NUL-termination. | 
|  | while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) { | 
|  | // Parse the line even if it will be ignored (when we do not have a Collator) | 
|  | // in order to report syntax issues. | 
|  | Collation::Level relation = parseRelationAndString(s, errorCode); | 
|  | if(errorCode.isFailure()) { | 
|  | errorCode.reset(); | 
|  | break; | 
|  | } | 
|  | if(coll == NULL) { | 
|  | // We were unable to create the Collator but continue with tests. | 
|  | // Ignore test data for this Collator. | 
|  | // The next Collator creation might work. | 
|  | continue; | 
|  | } | 
|  | UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS; | 
|  | Collation::Level expectedLevel = relation; | 
|  | s.getTerminatedBuffer();  // Ensure NUL-termination. | 
|  | UBool isOk = TRUE; | 
|  | if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) { | 
|  | coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode); | 
|  | isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s, | 
|  | expectedOrder, expectedLevel, errorCode); | 
|  | } | 
|  | if(isOk) { | 
|  | coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode); | 
|  | isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s, | 
|  | expectedOrder, expectedLevel, errorCode); | 
|  | } | 
|  | if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) { | 
|  | UnicodeString pn = nfd->normalize(prevString, errorCode); | 
|  | UnicodeString n = nfd->normalize(s, errorCode); | 
|  | pn.getTerminatedBuffer(); | 
|  | n.getTerminatedBuffer(); | 
|  | errorCode.assertSuccess(); | 
|  | isOk = checkCompareTwo("NFD input", prevFileLine, pn, n, | 
|  | expectedOrder, expectedLevel, errorCode); | 
|  | } | 
|  | if(!isOk) { | 
|  | errorCode.reset();  // already reported | 
|  | } | 
|  | prevFileLine = fileLine; | 
|  | prevString = s; | 
|  | prevString.getTerminatedBuffer();  // Ensure NUL-termination. | 
|  | } | 
|  | } | 
|  |  | 
|  | void CollationTest::TestDataDriven() { | 
|  | IcuTestErrorCode errorCode(*this, "TestDataDriven"); | 
|  |  | 
|  | fcd = Normalizer2Factory::getFCDInstance(errorCode); | 
|  | nfd = Normalizer2::getNFDInstance(errorCode); | 
|  | if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) { | 
|  | return; | 
|  | } | 
|  |  | 
|  | CharString path(getSourceTestData(errorCode), errorCode); | 
|  | path.appendPathPart("collationtest.txt", errorCode); | 
|  | const char *codePage = "UTF-8"; | 
|  | LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode)); | 
|  | if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) { | 
|  | return; | 
|  | } | 
|  | // Read a new line if necessary. | 
|  | // Sub-parsers leave the first line set that they do not handle. | 
|  | while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) { | 
|  | if(!isSectionStarter(fileLine[0])) { | 
|  | errln("syntax error on line %d", (int)fileLineNumber); | 
|  | infoln(fileLine); | 
|  | return; | 
|  | } | 
|  | if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) { | 
|  | fileTestName = fileLine; | 
|  | logln(fileLine); | 
|  | fileLine.remove(); | 
|  | } else if(fileLine == UNICODE_STRING("@ root", 6)) { | 
|  | setRootCollator(errorCode); | 
|  | fileLine.remove(); | 
|  | } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) { | 
|  | setLocaleCollator(errorCode); | 
|  | fileLine.remove(); | 
|  | } else if(fileLine == UNICODE_STRING("@ rules", 7)) { | 
|  | buildTailoring(f.getAlias(), errorCode); | 
|  | } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) {  // % | 
|  | parseAndSetAttribute(errorCode); | 
|  | } else if(fileLine == UNICODE_STRING("* compare", 9)) { | 
|  | checkCompareStrings(f.getAlias(), errorCode); | 
|  | } else { | 
|  | errln("syntax error on line %d", (int)fileLineNumber); | 
|  | infoln(fileLine); | 
|  | return; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | #endif  // !UCONFIG_NO_COLLATION |