blob: 851bc7f85e99863cc4590e20e981df11ebb9cf83 [file] [log] [blame]
Index: source/test/cintltst/usrchtst.c
===================================================================
--- source/test/cintltst/usrchtst.c (revision 75773)
+++ source/test/cintltst/usrchtst.c (working copy)
@@ -1,5 +1,5 @@
/********************************************************************
- * Copyright (c) 2001-2010 International Business Machines
+ * Copyright (c) 2001-2011 International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************
* File usrchtst.c
@@ -2553,7 +2553,173 @@
ucol_close(coll);
}
+/**
+* TestUsingSearchCollator
+*/
+#define ARRAY_LENGTH(array) (sizeof(array)/sizeof(array[0])) /* element count; the argument must be a real array, not a pointer */
+
+typedef struct { /* one search pattern plus the offsets the search is expected to report for it */
+ const UChar * pattern; /* zero-terminated pattern (handed to usearch_setPattern with length -1); NULL ends a table */
+ const int32_t * offsets; /* expected usearch_next results, in ascending order */
+ int32_t offsetsLen; /* number of entries in offsets */
+} PatternAndOffsets;
+
+static const UChar scKoText[] = { /* text that is searched; the NN markers below give the array index of each group's first unit */
+ 0x0020,
+/*01*/ 0xAC00, 0x0020, /* simple LV Hangul */
+/*03*/ 0xAC01, 0x0020, /* simple LVT Hangul */
+/*05*/ 0xAC0F, 0x0020, /* LVTT, last jamo expands for search */
+/*07*/ 0xAFFF, 0x0020, /* LLVVVTT, every jamo expands for search */
+/*09*/ 0x1100, 0x1161, 0x11A8, 0x0020, /* 0xAC01 as conjoining jamo */
+/*13*/ 0x1100, 0x1161, 0x1100, 0x0020, /* 0xAC01 as basic conjoining jamo (per search rules) */
+/*17*/ 0x3131, 0x314F, 0x3131, 0x0020, /* 0xAC01 as compatibility jamo */
+/*21*/ 0x1100, 0x1161, 0x11B6, 0x0020, /* 0xAC0F as conjoining jamo; last expands for search */
+/*25*/ 0x1100, 0x1161, 0x1105, 0x1112, 0x0020, /* 0xAC0F as basic conjoining jamo; last expands for search */
+/*30*/ 0x1101, 0x1170, 0x11B6, 0x0020, /* 0xAFFF as conjoining jamo; all expand for search */
+/*34*/ 0x00E6, 0x0020, /* small letter ae, expands */
+/*36*/ 0x1E4D, 0x0020, /* small letter o with tilde and acute, decomposes */
+ 0 /* terminator: the text is passed to usearch_openFromCollator with length -1 */
+};
+
+static const UChar scKoPat0[] = { 0xAC01, 0 }; /* precomposed LVT syllable */
+static const UChar scKoPat1[] = { 0x1100, 0x1161, 0x11A8, 0 }; /* 0xAC01 as conjoining jamo */
+static const UChar scKoPat2[] = { 0xAC0F, 0 }; /* precomposed LVTT syllable */
+static const UChar scKoPat3[] = { 0x1100, 0x1161, 0x1105, 0x1112, 0 }; /* 0xAC0F as basic conjoining jamo */
+static const UChar scKoPat4[] = { 0xAFFF, 0 }; /* precomposed LLVVVTT syllable */
+static const UChar scKoPat5[] = { 0x1101, 0x1170, 0x11B6, 0 }; /* 0xAFFF as conjoining jamo */
+
+static const int32_t scKoSrchOff01[] = { 3, 9, 13 }; /* expected for scKoPat0 and scKoPat1 */
+static const int32_t scKoSrchOff23[] = { 5, 21, 25 }; /* expected for scKoPat2 and scKoPat3 */
+static const int32_t scKoSrchOff45[] = { 7, 30 }; /* expected for scKoPat4 and scKoPat5 */
+
+static const PatternAndOffsets scKoSrchPatternsOffsets[] = { /* expectations used by the collation=search entries of tuscItems */
+ { scKoPat0, scKoSrchOff01, ARRAY_LENGTH(scKoSrchOff01) },
+ { scKoPat1, scKoSrchOff01, ARRAY_LENGTH(scKoSrchOff01) },
+ { scKoPat2, scKoSrchOff23, ARRAY_LENGTH(scKoSrchOff23) },
+ { scKoPat3, scKoSrchOff23, ARRAY_LENGTH(scKoSrchOff23) },
+ { scKoPat4, scKoSrchOff45, ARRAY_LENGTH(scKoSrchOff45) },
+ { scKoPat5, scKoSrchOff45, ARRAY_LENGTH(scKoSrchOff45) },
+ { NULL, NULL, 0 } /* end-of-table marker (NULL pattern) */
+};
+
+static const int32_t scKoStndOff01[] = { 3, 9 }; /* expected for scKoPat0 and scKoPat1; unlike the search table, 13 is absent */
+static const int32_t scKoStndOff2[] = { 5, 21 }; /* expected for scKoPat2 */
+static const int32_t scKoStndOff3[] = { 25 }; /* expected for scKoPat3 */
+static const int32_t scKoStndOff45[] = { 7, 30 }; /* expected for scKoPat4 and scKoPat5 */
+
+static const PatternAndOffsets scKoStndPatternsOffsets[] = { /* expectations used by the plain "root" entry of tuscItems */
+ { scKoPat0, scKoStndOff01, ARRAY_LENGTH(scKoStndOff01) },
+ { scKoPat1, scKoStndOff01, ARRAY_LENGTH(scKoStndOff01) },
+ { scKoPat2, scKoStndOff2, ARRAY_LENGTH(scKoStndOff2) },
+ { scKoPat3, scKoStndOff3, ARRAY_LENGTH(scKoStndOff3) },
+ { scKoPat4, scKoStndOff45, ARRAY_LENGTH(scKoStndOff45) },
+ { scKoPat5, scKoStndOff45, ARRAY_LENGTH(scKoStndOff45) },
+ { NULL, NULL, 0 } /* end-of-table marker (NULL pattern) */
+};
+
+typedef struct { /* one TestUsingSearchCollator case */
+ const char * locale; /* locale ID passed to ucol_open; NULL ends the table */
+ const UChar * text; /* zero-terminated text to search */
+ const PatternAndOffsets * patternsAndOffsets; /* table of patterns and expected offsets, ended by a NULL pattern */
+} TUSCItem;
+
+static const TUSCItem tuscItems[] = { /* cases run by TestUsingSearchCollator */
+ { "root", scKoText, scKoStndPatternsOffsets },
+ { "root@collation=search", scKoText, scKoSrchPatternsOffsets },
+ { "ko@collation=search", scKoText, scKoSrchPatternsOffsets },
+ { NULL, NULL, NULL } /* end-of-table marker (NULL locale) */
+};
+
+static const UChar dummyPat[] = { 0x0061, 0 }; /* placeholder pattern for usearch_openFromCollator; the test replaces it via usearch_setPattern */
+
+static void TestUsingSearchCollator(void) /* checks usearch_next/usearch_previous results for every tuscItems entry */
+{
+    const TUSCItem * tuscItemPtr;
+    for (tuscItemPtr = tuscItems; tuscItemPtr->locale != NULL; tuscItemPtr++) {
+        UErrorCode status = U_ZERO_ERROR;
+        UCollator* ucol = ucol_open(tuscItemPtr->locale, &status);
+        if ( U_SUCCESS(status) ) {
+            UStringSearch* usrch = usearch_openFromCollator(dummyPat, -1, tuscItemPtr->text, -1, ucol, NULL, &status);
+            if ( U_SUCCESS(status) ) {
+                const PatternAndOffsets * patternsOffsetsPtr;
+                for ( patternsOffsetsPtr = tuscItemPtr->patternsAndOffsets; patternsOffsetsPtr->pattern != NULL; patternsOffsetsPtr++) {
+                    status = U_ZERO_ERROR; /* start each pattern clean so an earlier failure is not reported as a setPattern failure */
+                    usearch_setPattern(usrch, patternsOffsetsPtr->pattern, -1, &status);
+                    if ( U_SUCCESS(status) ) {
+                        int32_t offset;
+                        const int32_t * nextOffsetPtr;
+                        const int32_t * limitOffsetPtr;
+                        /* Forward pass: walk the expected offsets first to last */
+                        usearch_reset(usrch);
+                        nextOffsetPtr = patternsOffsetsPtr->offsets;
+                        limitOffsetPtr = patternsOffsetsPtr->offsets + patternsOffsetsPtr->offsetsLen;
+                        while (TRUE) {
+                            offset = usearch_next(usrch, &status);
+                            if ( U_FAILURE(status) || offset == USEARCH_DONE ) {
+                                break;
+                            }
+                            if ( nextOffsetPtr < limitOffsetPtr ) {
+                                if (offset != *nextOffsetPtr) {
+                                    log_err("error, locale %s, expected usearch_next %d, got %d\n", tuscItemPtr->locale, *nextOffsetPtr, offset);
+                                    nextOffsetPtr = limitOffsetPtr; /* suppresses the "fewer matches" report below */
+                                    break;
+                                }
+                                nextOffsetPtr++;
+                            } else {
+                                log_err("error, locale %s, usearch_next returned more matches than expected\n", tuscItemPtr->locale );
+                            }
+                        }
+                        if ( U_FAILURE(status) ) {
+                            log_err("error, locale %s, usearch_next failed: %s\n", tuscItemPtr->locale, u_errorName(status) );
+                        } else if ( nextOffsetPtr < limitOffsetPtr ) {
+                            log_err("error, locale %s, usearch_next returned fewer matches than expected\n", tuscItemPtr->locale );
+                        }
+                        /* Backward pass: walk the same expected offsets last to first */
+                        status = U_ZERO_ERROR;
+                        usearch_reset(usrch);
+                        nextOffsetPtr = patternsOffsetsPtr->offsets + patternsOffsetsPtr->offsetsLen;
+                        limitOffsetPtr = patternsOffsetsPtr->offsets;
+                        while (TRUE) {
+                            offset = usearch_previous(usrch, &status);
+                            if ( U_FAILURE(status) || offset == USEARCH_DONE ) {
+                                break;
+                            }
+                            if ( nextOffsetPtr > limitOffsetPtr ) {
+                                nextOffsetPtr--;
+                                if (offset != *nextOffsetPtr) {
+                                    log_err("error, locale %s, expected usearch_previous %d, got %d\n", tuscItemPtr->locale, *nextOffsetPtr, offset);
+                                    nextOffsetPtr = limitOffsetPtr; /* suppresses the "fewer matches" report below */
+                                    break;
+                                }
+                            } else {
+                                log_err("error, locale %s, usearch_previous returned more matches than expected\n", tuscItemPtr->locale );
+                            }
+                        }
+                        if ( U_FAILURE(status) ) {
+                            log_err("error, locale %s, usearch_previous failed: %s\n", tuscItemPtr->locale, u_errorName(status) );
+                        } else if ( nextOffsetPtr > limitOffsetPtr ) {
+                            log_err("error, locale %s, usearch_previous returned fewer matches than expected\n", tuscItemPtr->locale );
+                        }
+                    } else {
+                        log_err("error, locale %s, usearch_setPattern failed: %s\n", tuscItemPtr->locale, u_errorName(status) );
+                    }
+                }
+                usearch_close(usrch);
+            } else {
+                log_err("error, locale %s, usearch_openFromCollator failed: %s\n", tuscItemPtr->locale, u_errorName(status) );
+            }
+            ucol_close(ucol);
+        } else {
+            log_err("error, locale %s, ucol_open failed: %s\n", tuscItemPtr->locale, u_errorName(status) );
+        }
+    }
+}
+
+/**
+* addSearchTest
+*/
+
void addSearchTest(TestNode** root)
{
addTest(root, &TestStart, "tscoll/usrchtst/TestStart");
@@ -2608,6 +2774,7 @@
addTest(root, &TestForwardBackward, "tscoll/usrchtst/TestForwardBackward");
addTest(root, &TestSearchForNull, "tscoll/usrchtst/TestSearchForNull");
addTest(root, &TestStrengthIdentical, "tscoll/usrchtst/TestStrengthIdentical");
+ addTest(root, &TestUsingSearchCollator, "tscoll/usrchtst/TestUsingSearchCollator");
}
#endif /* #if !UCONFIG_NO_COLLATION */
Index: source/test/cintltst/citertst.c
===================================================================
--- source/test/cintltst/citertst.c (revision 75773)
+++ source/test/cintltst/citertst.c (working copy)
@@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
- * Copyright (c) 1997-2010, International Business Machines Corporation and
+ * Copyright (c) 1997-2011, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
@@ -22,6 +22,7 @@
#if !UCONFIG_NO_COLLATION
#include "unicode/ucol.h"
+#include "unicode/ucoleitr.h"
#include "unicode/uloc.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"
@@ -58,6 +59,7 @@
addTest(root, &TestCEBufferOverflow, "tscoll/citertst/TestCEBufferOverflow");
addTest(root, &TestCEValidity, "tscoll/citertst/TestCEValidity");
addTest(root, &TestSortKeyValidity, "tscoll/citertst/TestSortKeyValidity");
+ addTest(root, &TestSearchCollatorElements, "tscoll/citertst/TestSearchCollatorElements");
}
/* The locales we support */
@@ -2017,4 +2019,141 @@
T_FileStream_close(file);
}
+/**
+* TestSearchCollatorElements tests iterator behavior (forwards and backwards) with
+* normalization on AND jamo tailoring, among other things.
+*/
+static const UChar tsceText[] = { /* Nothing in here should be ignorable */
+ 0x0020, 0xAC00, /* simple LV Hangul */
+ 0x0020, 0xAC01, /* simple LVT Hangul */
+ 0x0020, 0xAC0F, /* LVTT, last jamo expands for search */
+ 0x0020, 0xAFFF, /* LLVVVTT, every jamo expands for search */
+ 0x0020, 0x1100, 0x1161, 0x11A8, /* 0xAC01 as conjoining jamo */
+ 0x0020, 0x3131, 0x314F, 0x3131, /* 0xAC01 as compatibility jamo */
+ 0x0020, 0x1100, 0x1161, 0x11B6, /* 0xAC0F as conjoining jamo; last expands for search */
+ 0x0020, 0x1101, 0x1170, 0x11B6, /* 0xAFFF as conjoining jamo; all expand for search */
+ 0x0020, 0x00E6, /* small letter ae, expands */
+ 0x0020, 0x1E4D, /* small letter o with tilde and acute, decomposes */
+ 0x0020 /* no zero terminator: the length is passed explicitly as kLen_tsceText */
+};
+enum { kLen_tsceText = sizeof(tsceText)/sizeof(tsceText[0]) }; /* number of UChars in tsceText */
+
+static const int32_t rootStandardOffsets[] = { /* ucol_getOffset value expected just before each ucol_next call on tsceText, "root" locale */
+ 0, 1,2,
+ 2, 3,4,4,
+ 4, 5,6,6,
+ 6, 7,8,8,
+ 8, 9,10,11,
+ 12, 13,14,15,
+ 16, 17,18,19,
+ 20, 21,22,23,
+ 24, 25,26,26,26,
+ 26, 27,28,28,
+ 28,
+ 29 /* offset seen before the final ucol_next call, the one expected to return UCOL_NULLORDER */
+};
+enum { kLen_rootStandardOffsets = sizeof(rootStandardOffsets)/sizeof(rootStandardOffsets[0]) }; /* entry count */
+
+static const int32_t rootSearchOffsets[] = { /* same layout as rootStandardOffsets, for the "root@collation=search" locale */
+ 0, 1,2,
+ 2, 3,4,4,
+ 4, 5,6,6,6,
+ 6, 7,8,8,8,8,8,8,
+ 8, 9,10,11,
+ 12, 13,14,15,
+ 16, 17,18,19,20,
+ 20, 21,22,22,23,23,23,24,
+ 24, 25,26,26,26,
+ 26, 27,28,28,
+ 28,
+ 29 /* offset seen before the final ucol_next call, the one expected to return UCOL_NULLORDER */
+};
+enum { kLen_rootSearchOffsets = sizeof(rootSearchOffsets)/sizeof(rootSearchOffsets[0]) }; /* entry count */
+
+typedef struct { /* one TestSearchCollatorElements case */
+ const char * locale; /* locale ID passed to ucol_open; NULL ends the table */
+ const int32_t * offsets; /* expected ucol_getOffset values, in forward iteration order */
+ int32_t offsetsLen; /* number of entries in offsets */
+} TSCEItem;
+
+static const TSCEItem tsceItems[] = { /* cases run by TestSearchCollatorElements, all iterating over tsceText */
+ { "root", rootStandardOffsets, kLen_rootStandardOffsets },
+ { "root@collation=search", rootSearchOffsets, kLen_rootSearchOffsets },
+ { NULL, NULL, 0 } /* end-of-table marker (NULL locale) */
+};
+
+static void TestSearchCollatorElements(void) /* compares ucol_getOffset with the expected table before each ucol_next/ucol_previous call */
+{
+    const TSCEItem * tsceItemPtr;
+    for (tsceItemPtr = tsceItems; tsceItemPtr->locale != NULL; tsceItemPtr++) {
+        UErrorCode status = U_ZERO_ERROR;
+        UCollator* ucol = ucol_open(tsceItemPtr->locale, &status);
+        if ( U_SUCCESS(status) ) {
+            UCollationElements * uce = ucol_openElements(ucol, tsceText, kLen_tsceText, &status);
+            if ( U_SUCCESS(status) ) {
+                int32_t offset, element;
+                const int32_t * nextOffsetPtr;
+                const int32_t * limitOffsetPtr;
+                /* Forward pass: the offset is sampled before each ucol_next, including the call that ends the loop */
+                nextOffsetPtr = tsceItemPtr->offsets;
+                limitOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen;
+                do {
+                    offset = ucol_getOffset(uce);
+                    element = ucol_next(uce, &status);
+                    if ( element == 0 ) {
+                        log_err("error, locale %s, ucol_next returned element 0\n", tsceItemPtr->locale );
+                    }
+                    if ( nextOffsetPtr < limitOffsetPtr ) {
+                        if (offset != *nextOffsetPtr) {
+                            log_err("error, locale %s, expected ucol_next -> ucol_getOffset %d, got %d\n",
+                                                            tsceItemPtr->locale, *nextOffsetPtr, offset );
+                            nextOffsetPtr = limitOffsetPtr; /* suppresses the "fewer elements" report below */
+                            break;
+                        }
+                        nextOffsetPtr++;
+                    } else {
+                        log_err("error, locale %s, ucol_next returned more elements than expected\n", tsceItemPtr->locale );
+                    }
+                } while ( U_SUCCESS(status) && element != UCOL_NULLORDER );
+                if ( nextOffsetPtr < limitOffsetPtr ) {
+                    log_err("error, locale %s, ucol_next returned fewer elements than expected\n", tsceItemPtr->locale );
+                }
+                /* Backward pass: clear status BEFORE repositioning, so a forward-pass failure cannot turn ucol_setOffset into a no-op */
+                status = U_ZERO_ERROR;
+                ucol_setOffset(uce, kLen_tsceText, &status);
+                nextOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen;
+                limitOffsetPtr = tsceItemPtr->offsets;
+                do {
+                    offset = ucol_getOffset(uce);
+                    element = ucol_previous(uce, &status);
+                    if ( element == 0 ) {
+                        log_err("error, locale %s, ucol_previous returned element 0\n", tsceItemPtr->locale );
+                    }
+                    if ( nextOffsetPtr > limitOffsetPtr ) {
+                        nextOffsetPtr--;
+                        if (offset != *nextOffsetPtr) {
+                            log_err("error, locale %s, expected ucol_previous -> ucol_getOffset %d, got %d\n",
+                                                                tsceItemPtr->locale, *nextOffsetPtr, offset );
+                            nextOffsetPtr = limitOffsetPtr; /* suppresses the "fewer elements" report below */
+                            break;
+                        }
+                    } else {
+                        log_err("error, locale %s, ucol_previous returned more elements than expected\n", tsceItemPtr->locale );
+                    }
+                } while ( U_SUCCESS(status) && element != UCOL_NULLORDER );
+                if ( nextOffsetPtr > limitOffsetPtr ) {
+                    log_err("error, locale %s, ucol_previous returned fewer elements than expected\n", tsceItemPtr->locale );
+                }
+
+                ucol_closeElements(uce);
+            } else {
+                log_err("error, locale %s, ucol_openElements failed: %s\n", tsceItemPtr->locale, u_errorName(status) );
+            }
+            ucol_close(ucol);
+        } else {
+            log_err("error, locale %s, ucol_open failed: %s\n", tsceItemPtr->locale, u_errorName(status) );
+        }
+    }
+}
+
#endif /* #if !UCONFIG_NO_COLLATION */
Index: source/test/cintltst/citertst.h
===================================================================
--- source/test/cintltst/citertst.h (revision 75773)
+++ source/test/cintltst/citertst.h (working copy)
@@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
- * Copyright (c) 1997-2008, International Business Machines Corporation and
+ * Copyright (c) 1997-2008,2011, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
@@ -101,6 +101,11 @@
* Bound checkings.
*/
static void TestSortKeyValidity(void);
+/**
+* TestSearchCollatorElements tests iterator behavior (forwards and backwards) with
+* normalization on AND jamo tailoring, among other things.
+*/
+static void TestSearchCollatorElements(void);
/*------------------------------------------------------------------------
Internal utilities
Index: source/i18n/ucol.cpp
===================================================================
--- source/i18n/ucol.cpp (revision 75773)
+++ source/i18n/ucol.cpp (working copy)
@@ -1,6 +1,6 @@
/*
*******************************************************************************
-* Copyright (C) 1996-2010, International Business Machines
+* Copyright (C) 1996-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: ucol.cpp
@@ -1444,173 +1444,176 @@
UChar ch = 0;
collationSource->offsetReturn = NULL;
- for (;;) /* Loop handles case when incremental normalize switches */
- { /* to or from the side buffer / original string, and we */
- /* need to start again to get the next character. */
+ do {
+ for (;;) /* Loop handles case when incremental normalize switches */
+ { /* to or from the side buffer / original string, and we */
+ /* need to start again to get the next character. */
- if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
- {
- // The source string is null terminated and we're not working from the side buffer,
- // and we're not normalizing. This is the fast path.
- // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
- ch = *collationSource->pos++;
- if (ch != 0) {
- break;
+ if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
+ {
+ // The source string is null terminated and we're not working from the side buffer,
+ // and we're not normalizing. This is the fast path.
+ // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
+ ch = *collationSource->pos++;
+ if (ch != 0) {
+ break;
+ }
+ else {
+ return UCOL_NO_MORE_CES;
+ }
}
- else {
- return UCOL_NO_MORE_CES;
- }
- }
- if (collationSource->flags & UCOL_ITER_HASLEN) {
- // Normal path for strings when length is specified.
- // (We can't be in side buffer because it is always null terminated.)
- if (collationSource->pos >= collationSource->endp) {
- // Ran off of the end of the main source string. We're done.
- return UCOL_NO_MORE_CES;
+ if (collationSource->flags & UCOL_ITER_HASLEN) {
+ // Normal path for strings when length is specified.
+ // (We can't be in side buffer because it is always null terminated.)
+ if (collationSource->pos >= collationSource->endp) {
+ // Ran off of the end of the main source string. We're done.
+ return UCOL_NO_MORE_CES;
+ }
+ ch = *collationSource->pos++;
}
- ch = *collationSource->pos++;
- }
- else if(collationSource->flags & UCOL_USE_ITERATOR) {
- UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
- if(iterCh == U_SENTINEL) {
- return UCOL_NO_MORE_CES;
- }
- ch = (UChar)iterCh;
- }
- else
- {
- // Null terminated string.
- ch = *collationSource->pos++;
- if (ch == 0) {
- // Ran off end of buffer.
- if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
- // Ran off end of main string. backing up one character.
- collationSource->pos--;
+ else if(collationSource->flags & UCOL_USE_ITERATOR) {
+ UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
+ if(iterCh == U_SENTINEL) {
return UCOL_NO_MORE_CES;
}
- else
- {
- // Hit null in the normalize side buffer.
- // Usually this means the end of the normalized data,
- // except for one odd case: a null followed by combining chars,
- // which is the case if we are at the start of the buffer.
- if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
- break;
+ ch = (UChar)iterCh;
+ }
+ else
+ {
+ // Null terminated string.
+ ch = *collationSource->pos++;
+ if (ch == 0) {
+ // Ran off end of buffer.
+ if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
+ // Ran off end of main string. backing up one character.
+ collationSource->pos--;
+ return UCOL_NO_MORE_CES;
}
+ else
+ {
+ // Hit null in the normalize side buffer.
+ // Usually this means the end of the normalized data,
+ // except for one odd case: a null followed by combining chars,
+ // which is the case if we are at the start of the buffer.
+ if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
+ break;
+ }
- // Null marked end of side buffer.
- // Revert to the main string and
- // loop back to top to try again to get a character.
- collationSource->pos = collationSource->fcdPosition;
- collationSource->flags = collationSource->origFlags;
- continue;
+ // Null marked end of side buffer.
+ // Revert to the main string and
+ // loop back to top to try again to get a character.
+ collationSource->pos = collationSource->fcdPosition;
+ collationSource->flags = collationSource->origFlags;
+ continue;
+ }
}
}
- }
- if(collationSource->flags&UCOL_HIRAGANA_Q) {
- /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
- * based on whether the previous codepoint was Hiragana or Katakana.
- */
- if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
- ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
- collationSource->flags |= UCOL_WAS_HIRAGANA;
- } else {
- collationSource->flags &= ~UCOL_WAS_HIRAGANA;
+ if(collationSource->flags&UCOL_HIRAGANA_Q) {
+ /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
+ * based on whether the previous codepoint was Hiragana or Katakana.
+ */
+ if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
+ ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
+ collationSource->flags |= UCOL_WAS_HIRAGANA;
+ } else {
+ collationSource->flags &= ~UCOL_WAS_HIRAGANA;
+ }
}
- }
- // We've got a character. See if there's any fcd and/or normalization stuff to do.
- // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
- if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
- break;
- }
+ // We've got a character. See if there's any fcd and/or normalization stuff to do.
+ // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
+ if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
+ break;
+ }
- if (collationSource->fcdPosition >= collationSource->pos) {
- // An earlier FCD check has already covered the current character.
- // We can go ahead and process this char.
- break;
- }
-
- if (ch < ZERO_CC_LIMIT_ ) {
- // Fast fcd safe path. Trailing combining class == 0. This char is OK.
- break;
- }
-
- if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
- // We need to peek at the next character in order to tell if we are FCD
- if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
- // We are at the last char of source string.
- // It is always OK for FCD check.
+ if (collationSource->fcdPosition >= collationSource->pos) {
+ // An earlier FCD check has already covered the current character.
+ // We can go ahead and process this char.
break;
}
- // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test
- if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
+ if (ch < ZERO_CC_LIMIT_ ) {
+ // Fast fcd safe path. Trailing combining class == 0. This char is OK.
break;
}
- }
+ if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
+ // We need to peek at the next character in order to tell if we are FCD
+ if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
+ // We are at the last char of source string.
+ // It is always OK for FCD check.
+ break;
+ }
- // Need a more complete FCD check and possible normalization.
- if (collIterFCD(collationSource)) {
- collIterNormalize(collationSource);
- }
- if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
- // No normalization was needed. Go ahead and process the char we already had.
- break;
- }
+ // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test
+ if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
+ break;
+ }
+ }
- // Some normalization happened. Next loop iteration will pick up a char
- // from the normalization buffer.
- } // end for (;;)
+ // Need a more complete FCD check and possible normalization.
+ if (collIterFCD(collationSource)) {
+ collIterNormalize(collationSource);
+ }
+ if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
+ // No normalization was needed. Go ahead and process the char we already had.
+ break;
+ }
+ // Some normalization happened. Next loop iteration will pick up a char
+ // from the normalization buffer.
- if (ch <= 0xFF) {
- /* For latin-1 characters we never need to fall back to the UCA table */
- /* because all of the UCA data is replicated in the latinOneMapping array */
- order = coll->latinOneMapping[ch];
- if (order > UCOL_NOT_FOUND) {
- order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
+ } // end for (;;)
+
+
+ if (ch <= 0xFF) {
+ /* For latin-1 characters we never need to fall back to the UCA table */
+ /* because all of the UCA data is replicated in the latinOneMapping array */
+ order = coll->latinOneMapping[ch];
+ if (order > UCOL_NOT_FOUND) {
+ order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
+ }
}
- }
- else
- {
- // Always use UCA for Han, Hangul
- // (Han extension A is before main Han block)
- // **** Han compatibility chars ?? ****
- if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
- (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
- if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
- // between the two target ranges; do normal lookup
- // **** this range is YI, Modifier tone letters, ****
- // **** Latin-D, Syloti Nagari, Phagas-pa. ****
- // **** Latin-D might be tailored, so we need to ****
- // **** do the normal lookup for these guys. ****
+ else
+ {
+ // Always use UCA for Han, Hangul
+ // (Han extension A is before main Han block)
+ // **** Han compatibility chars ?? ****
+ if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
+ (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
+ if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
+ // between the two target ranges; do normal lookup
+ // **** this range is YI, Modifier tone letters, ****
+ // **** Latin-D, Syloti Nagari, Phagas-pa. ****
+ // **** Latin-D might be tailored, so we need to ****
+ // **** do the normal lookup for these guys. ****
+ order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
+ } else {
+ // in one of the target ranges; use UCA
+ order = UCOL_NOT_FOUND;
+ }
+ } else {
order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
- } else {
- // in one of the target ranges; use UCA
- order = UCOL_NOT_FOUND;
}
- } else {
- order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
- }
- if(order > UCOL_NOT_FOUND) { /* if a CE is special */
- order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */
- }
+ if(order > UCOL_NOT_FOUND) { /* if a CE is special */
+ order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */
+ }
- if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */
- /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
- order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
+ if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */
+ /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
+ order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
- if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
- order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
+ if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
+ order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
+ }
}
}
- }
+ } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
+
if(order == UCOL_NOT_FOUND) {
order = getImplicit(ch, collationSource);
}
@@ -1958,161 +1961,163 @@
else {
UChar ch = 0;
- /*
- Loop handles case when incremental normalize switches to or from the
- side buffer / original string, and we need to start again to get the
- next character.
- */
- for (;;) {
- if (data->flags & UCOL_ITER_HASLEN) {
- /*
- Normal path for strings when length is specified.
- Not in side buffer because it is always null terminated.
- */
- if (data->pos <= data->string) {
- /* End of the main source string */
- return UCOL_NO_MORE_CES;
- }
- data->pos --;
- ch = *data->pos;
- }
- // we are using an iterator to go back. Pray for us!
- else if (data->flags & UCOL_USE_ITERATOR) {
- UChar32 iterCh = data->iterator->previous(data->iterator);
- if(iterCh == U_SENTINEL) {
- return UCOL_NO_MORE_CES;
- } else {
- ch = (UChar)iterCh;
- }
- }
- else {
- data->pos --;
- ch = *data->pos;
- /* we are in the side buffer. */
- if (ch == 0) {
+ do {
+ /*
+ Loop handles case when incremental normalize switches to or from the
+ side buffer / original string, and we need to start again to get the
+ next character.
+ */
+ for (;;) {
+ if (data->flags & UCOL_ITER_HASLEN) {
/*
- At the start of the normalize side buffer.
- Go back to string.
- Because pointer points to the last accessed character,
- hence we have to increment it by one here.
+ Normal path for strings when length is specified.
+ Not in side buffer because it is always null terminated.
*/
- data->flags = data->origFlags;
- data->offsetRepeatValue = 0;
-
- if (data->fcdPosition == NULL) {
- data->pos = data->string;
+ if (data->pos <= data->string) {
+ /* End of the main source string */
return UCOL_NO_MORE_CES;
}
- else {
- data->pos = data->fcdPosition + 1;
+ data->pos --;
+ ch = *data->pos;
+ }
+ // we are using an iterator to go back. Pray for us!
+ else if (data->flags & UCOL_USE_ITERATOR) {
+ UChar32 iterCh = data->iterator->previous(data->iterator);
+ if(iterCh == U_SENTINEL) {
+ return UCOL_NO_MORE_CES;
+ } else {
+ ch = (UChar)iterCh;
+ }
+ }
+ else {
+ data->pos --;
+ ch = *data->pos;
+ /* we are in the side buffer. */
+ if (ch == 0) {
+ /*
+ At the start of the normalize side buffer.
+ Go back to string.
+ Because pointer points to the last accessed character,
+ hence we have to increment it by one here.
+ */
+ data->flags = data->origFlags;
+ data->offsetRepeatValue = 0;
+
+ if (data->fcdPosition == NULL) {
+ data->pos = data->string;
+ return UCOL_NO_MORE_CES;
+ }
+ else {
+ data->pos = data->fcdPosition + 1;
+ }
+
+ continue;
}
-
- continue;
}
- }
- if(data->flags&UCOL_HIRAGANA_Q) {
- if(ch>=0x3040 && ch<=0x309f) {
- data->flags |= UCOL_WAS_HIRAGANA;
- } else {
- data->flags &= ~UCOL_WAS_HIRAGANA;
- }
- }
+ if(data->flags&UCOL_HIRAGANA_Q) {
+ if(ch>=0x3040 && ch<=0x309f) {
+ data->flags |= UCOL_WAS_HIRAGANA;
+ } else {
+ data->flags &= ~UCOL_WAS_HIRAGANA;
+ }
+ }
- /*
- * got a character to determine if there's fcd and/or normalization
- * stuff to do.
- * if the current character is not fcd.
- * if current character is at the start of the string
- * Trailing combining class == 0.
- * Note if pos is in the writablebuffer, norm is always 0
- */
- if (ch < ZERO_CC_LIMIT_ ||
- // this should propel us out of the loop in the iterator case
- (data->flags & UCOL_ITER_NORM) == 0 ||
- (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
- || data->string == data->pos) {
- break;
- }
-
- if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
- /* if next character is FCD */
- if (data->pos == data->string) {
- /* First char of string is always OK for FCD check */
+ /*
+ * got a character to determine if there's fcd and/or normalization
+ * stuff to do.
+ * if the current character is not fcd.
+ * if current character is at the start of the string
+ * Trailing combining class == 0.
+ * Note if pos is in the writablebuffer, norm is always 0
+ */
+ if (ch < ZERO_CC_LIMIT_ ||
+ // this should propel us out of the loop in the iterator case
+ (data->flags & UCOL_ITER_NORM) == 0 ||
+ (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
+ || data->string == data->pos) {
break;
}
- /* Not first char of string, do the FCD fast test */
- if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
+ if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
+ /* if next character is FCD */
+ if (data->pos == data->string) {
+ /* First char of string is always OK for FCD check */
+ break;
+ }
+
+ /* Not first char of string, do the FCD fast test */
+ if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
+ break;
+ }
+ }
+
+ /* Need a more complete FCD check and possible normalization. */
+ if (collPrevIterFCD(data)) {
+ collPrevIterNormalize(data);
+ }
+
+ if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
+ /* No normalization. Go ahead and process the char. */
break;
}
- }
- /* Need a more complete FCD check and possible normalization. */
- if (collPrevIterFCD(data)) {
- collPrevIterNormalize(data);
+ /*
+ Some normalization happened.
+ Next loop picks up a char from the normalization buffer.
+ */
}
- if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
- /* No normalization. Go ahead and process the char. */
- break;
- }
-
- /*
- Some normalization happened.
- Next loop picks up a char from the normalization buffer.
+ /* attempt to handle contractions, after removal of the backwards
+ contraction
*/
- }
-
- /* attempt to handle contractions, after removal of the backwards
- contraction
- */
- if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
- result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
- } else {
- if (ch <= 0xFF) {
- result = coll->latinOneMapping[ch];
- }
- else {
- // Always use UCA for [3400..9FFF], [AC00..D7AF]
- // **** [FA0E..FA2F] ?? ****
- if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
- (ch >= 0x3400 && ch <= 0xD7AF)) {
- if (ch > 0x9FFF && ch < 0xAC00) {
- // between the two target ranges; do normal lookup
- // **** this range is YI, Modifier tone letters, ****
- // **** Latin-D, Syloti Nagari, Phagas-pa. ****
- // **** Latin-D might be tailored, so we need to ****
- // **** do the normal lookup for these guys. ****
- result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
+ if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
+ result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
+ } else {
+ if (ch <= 0xFF) {
+ result = coll->latinOneMapping[ch];
+ }
+ else {
+ // Always use UCA for [3400..9FFF], [AC00..D7AF]
+ // **** [FA0E..FA2F] ?? ****
+ if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
+ (ch >= 0x3400 && ch <= 0xD7AF)) {
+ if (ch > 0x9FFF && ch < 0xAC00) {
+ // between the two target ranges; do normal lookup
+ // **** this range is YI, Modifier tone letters, ****
+ // **** Latin-D, Syloti Nagari, Phagas-pa. ****
+ // **** Latin-D might be tailored, so we need to ****
+ // **** do the normal lookup for these guys. ****
+ result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
+ } else {
+ result = UCOL_NOT_FOUND;
+ }
} else {
- result = UCOL_NOT_FOUND;
+ result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
}
- } else {
- result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
}
- }
- if (result > UCOL_NOT_FOUND) {
- result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
- }
- if (result == UCOL_NOT_FOUND) { // Not found in master list
- if (!isAtStartPrevIterate(data) &&
- ucol_contractionEndCP(ch, data->coll))
- {
- result = UCOL_CONTRACTION;
- } else {
- if(coll->UCA) {
- result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
+ if (result > UCOL_NOT_FOUND) {
+ result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
+ }
+ if (result == UCOL_NOT_FOUND) { // Not found in master list
+ if (!isAtStartPrevIterate(data) &&
+ ucol_contractionEndCP(ch, data->coll))
+ {
+ result = UCOL_CONTRACTION;
+ } else {
+ if(coll->UCA) {
+ result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
+ }
}
- }
- if (result > UCOL_NOT_FOUND) {
- if(coll->UCA) {
- result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
+ if (result > UCOL_NOT_FOUND) {
+ if(coll->UCA) {
+ result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
+ }
}
}
}
- }
+ } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
if(result == UCOL_NOT_FOUND) {
result = getPrevImplicit(ch, data);
@@ -3193,6 +3198,7 @@
// Since Hanguls pass the FCD check, it is
// guaranteed that we won't be in
// the normalization buffer if something like this happens
+
// However, if we are using a uchar iterator and normalization
// is ON, the Hangul that lead us here is going to be in that
// normalization buffer. Here we want to restore the uchar
@@ -3201,6 +3207,7 @@
source->flags = source->origFlags; // restore the iterator
source->pos = NULL;
}
+
// Move Jamos into normalization buffer
UChar *buffer = source->writableBuffer.getBuffer(4);
int32_t bufferLength;
@@ -3214,8 +3221,9 @@
}
source->writableBuffer.releaseBuffer(bufferLength);
- source->fcdPosition = source->pos; // Indicate where to continue in main input string
- // after exhausting the writableBuffer
+ // Indicate where to continue in main input string after exhausting the writableBuffer
+ source->fcdPosition = source->pos;
+
source->pos = source->writableBuffer.getTerminatedBuffer();
source->origFlags = source->flags;
source->flags |= UCOL_ITER_INNORMBUF;
@@ -3966,13 +3974,10 @@
// Since Hanguls pass the FCD check, it is
// guaranteed that we won't be in
// the normalization buffer if something like this happens
+
// Move Jamos into normalization buffer
- /*
- Move the Jamos into the
- normalization buffer
- */
UChar *tempbuffer = source->writableBuffer.getBuffer(5);
- int32_t tempbufferLength;
+ int32_t tempbufferLength, jamoOffset; // jamoOffset: offset value recorded for the extra jamo (assigned in the following hunk)
tempbuffer[0] = 0;
tempbuffer[1] = (UChar)L;
tempbuffer[2] = (UChar)V;
@@ -3984,16 +3989,30 @@
}
source->writableBuffer.releaseBuffer(tempbufferLength);
- /*
- Indicate where to continue in main input string after exhausting
- the writableBuffer
- */
+ // Indicate where to continue in main input string after exhausting the writableBuffer,
+ jamoOffset = 0; // pos is at the very start of the string
if (source->pos == source->string) {
source->fcdPosition = NULL;
} else {
+ jamoOffset = source->pos - source->string; // NOTE(review): pointer difference stored in int32_t, presumably an implicit narrowing on 64-bit builds; confirm no explicit cast is wanted
source->fcdPosition = source->pos-1;
}
+
+ // Append offsets for the additional chars
+ // (not the 0, and not the L whose offsets match the original Hangul)
+ int32_t jamoRemaining = tempbufferLength - 2; // buffer entries minus the leading 0 and the L
+ jamoOffset++; // appended offsets should match end of original Hangul
+ while (jamoRemaining-- > 0) {
+ source->appendOffset(jamoOffset, *status); // same offset for every extra jamo; NOTE(review): *status is not checked afterwards, confirm a failure here is handled by the caller
+ }
+ source->offsetRepeatValue = jamoOffset; // presumably the offset to keep reporting for this expansion; verify against the readers of offsetRepeatValue
+
+ source->offsetReturn = source->offsetStore - 1; // the slot immediately before offsetStore
+ if (source->offsetReturn == source->offsetBuffer) {
+ source->offsetStore = source->offsetBuffer; // offsetReturn is the first slot, so rewind offsetStore to the buffer start
+ }
+
source->pos = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;
source->origFlags = source->flags;
source->flags |= UCOL_ITER_INNORMBUF;