third_party/icu/source/i18n/uspoof_wsconf.cpp - cobalt - Git at Google

 /*
 ******************************************************************************
 *
 *   Copyright (C) 2008-2013, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
 *   file name:  uspoof_wsconf.cpp
 *   encoding:   US-ASCII
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 2009Jan05  (refactoring earlier files)
 *   created by: Andy Heninger
 *
 *   Internal functions for compiling Whole Script confusable source data
 *   into its binary (runtime) form.  The binary data format is described
 *   in uspoof_impl.h
 */

 #include "unicode/utypes.h"
 #include "unicode/uspoof.h"

 #if !UCONFIG_NO_NORMALIZATION

 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

 #if defined(STARBOARD)
 #include "starboard/client_porting/poem/assert_poem.h"
 #include "starboard/client_porting/poem/string_poem.h"
 #endif  // defined(STARBOARD)
 #include "unicode/unorm.h"
 #include "unicode/uregex.h"
 #include "unicode/ustring.h"
 #include "cmemory.h"
 #include "scriptset.h"
 #include "uspoof_impl.h"
 #include "uhash.h"
 #include "uvector.h"
 #include "uassert.h"
 #include "uspoof_wsconf.h"

 U_NAMESPACE_USE


 // Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
 // Example Lines:
 //   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
 //   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
 //    |               |     |    |
 //    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
 //    |               |     |----------Target script.   We need this.
 //    |               |----------------Src script.  Should match the script of the source
 //    |                                code points.  Beyond checking that, we don't keep it.
 //    |--------------------------------Source code points or range.
 //
 // The expression will match _all_ lines, including erroneous lines.
 // The result of the parse is returned via the contents of the (match) groups.
 // Pattern source for the line parser.  buildWSConfusableData() converts it to a
 // UnicodeString and compiles it with uregex_open(); the capture group numbers noted
 // on each line below are the ones that function queries.
 // NOTE(review): the ".." between the two code points is not escaped, so it matches
 //   any two characters rather than only a literal ".." - confirm this is intended.
 static const char *parseExp =
         "(?m)"                                         // Multi-line mode
         "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
         "|^(?:"                                        //   OR
         "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
         "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
         "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
         "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
         "[ \\t]*(?:#.*?)?"                             // Trailing comment
         ")$|"                                          //   OR
         "^(.*?)$";                                     // An error line.      Group 8.
                                                        //    Any line not matching the preceding
                                                        //    parts of the expression will match
                                                        //    this, and thus be flagged as an error


 // Extract a regular expression match group into a char * string.
 //    The group must contain only invariant characters.
 //    Used for script names
 //
 // Copy the text of one capture group of the current match into destBuf as a
 // NUL-terminated char string, converting with the invariant-character converter.
 //
 //   e             Regular expression holding the current match.
 //   group         Number of the capture group to copy.
 //   destBuf       Output buffer.  It is set to the empty string first, and stays empty
 //                 if status is a failure, uregex_group() returns -1, or the group
 //                 text needs destCapacity or more chars.
 //   destCapacity  Size of destBuf, in chars.
 //   status        ICU error code, passed through to uregex_group().
 //
 static void extractGroup(
     URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {

     enum { kGroupTextCapacity = 50 };
     UChar groupText[kGroupTextCapacity];
     groupText[0] = 0;
     destBuf[0] = 0;

     const int32_t groupLen = uregex_group(e, group, groupText, kGroupTextCapacity, &status);
     if (!U_FAILURE(status) && groupLen != -1 && groupLen < destCapacity) {
         // Read-only alias of the stack buffer; the UChars are not copied.
         UnicodeString groupAlias(FALSE, groupText, groupLen);
         groupAlias.extract(0, groupLen, destBuf, destCapacity, US_INV);
     }
 }


 U_NAMESPACE_BEGIN

 //  Build the Whole Script Confusable data
 //
 //     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
 //                         because everything is local to this one build function anyhow,
 //                           OR
 //                         break this function into more reasonably sized pieces, with
 //                         state in WSConfusableDataBuilder.
 //
 // Compile Whole Script Confusable source data into the binary tables of a SpoofImpl.
 //
 //   spImpl            Receives the result: two serialized UTrie2s and an array of
 //                     ScriptSets are appended to spImpl->fSpoofData.
 //   confusablesWS     Source text, UTF-8, in the format of confusablesWholeScript.txt.
 //   confusablesWSLen  Length of confusablesWS, passed unchanged to u_strFromUTF8().
 //   pe                When non-NULL and the function ends with a failing status,
 //                     pe->line is set to the number of input lines matched so far.
 //   status            ICU error code.  Nothing is done if it already indicates failure.
 //                     U_PARSE_ERROR reports a malformed line; U_INVALID_FORMAT_ERROR an
 //                     unrecognized script name or a code point whose script differs
 //                     from the source script on its line; U_MEMORY_ALLOCATION_ERROR a
 //                     failed allocation.
 //
 void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
           int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
 {
     if (U_FAILURE(status)) {
         return;
     }
     URegularExpression *parseRegexp = NULL;
     int32_t             inputLen    = 0;
     UChar              *input       = NULL;
     int32_t             lineNum     = 0;

     UVector            *scriptSets        = NULL;
     uint32_t            rtScriptSetsCount = 2;

     UTrie2             *anyCaseTrie   = NULL;
     UTrie2             *lowerCaseTrie = NULL;

     anyCaseTrie = utrie2_open(0, 0, &status);
     lowerCaseTrie = utrie2_open(0, 0, &status);

     UnicodeString pattern(parseExp, -1, US_INV);

     // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
     //
     // Reserved TRIE values:
     //   0:  Code point has no whole script confusables.
     //   1:  Code point is of script Common or Inherited.
     //       These code points do not participate in whole script confusable detection.
     //       (This is logically equivalent to saying that they contain confusables in
     //        all scripts)
     //
     // Because Trie values are indexes into the ScriptSets vector, pre-fill
     // vector positions 0 and 1 to avoid conflicts with the reserved values.

     scriptSets = new UVector(status);
     if (scriptSets == NULL) {
         status = U_MEMORY_ALLOCATION_ERROR;
         goto cleanup;
     }
     scriptSets->addElement((void *)NULL, status);
     scriptSets->addElement((void *)NULL, status);

     // Convert the user input data from UTF-8 to UChar (UTF-16).
     // The first call only measures: a NULL destination is expected to report
     // U_BUFFER_OVERFLOW_ERROR and leave the required length in inputLen.
     u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
     if (status != U_BUFFER_OVERFLOW_ERROR) {
         goto cleanup;
     }
     status = U_ZERO_ERROR;
     input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
     if (input == NULL) {
         status = U_MEMORY_ALLOCATION_ERROR;
         goto cleanup;
     }
     u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);

     parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);

     // Stop here if the conversion or the pattern compilation failed, so that the
     //   contents of input are never examined after a failed conversion.
     if (U_FAILURE(status)) {
         goto cleanup;
     }

     // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
     //   given the syntax of the input.
     if (*input == 0xfeff) {
         *input = 0x20;
     }

     // Parse the input, one line per iteration of this loop.
     uregex_setText(parseRegexp, input, inputLen, &status);
     while (uregex_findNext(parseRegexp, &status)) {
         lineNum++;
         if (uregex_start(parseRegexp, 1, &status) >= 0) {
             // this was a blank or comment line.
             continue;
         }
         if (uregex_start(parseRegexp, 8, &status) >= 0) {
             // input file syntax error.
             status = U_PARSE_ERROR;
             goto cleanup;
         }
         if (U_FAILURE(status)) {
             goto cleanup;
         }

         // Pick up the start and optional range end code points from the parsed line.
         UChar32  startCodePoint = SpoofImpl::ScanHex(
             input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
         UChar32  endCodePoint = startCodePoint;
         if (uregex_start(parseRegexp, 3, &status) >=0) {
             endCodePoint = SpoofImpl::ScanHex(
                 input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
         }

         // Extract the two script names from the source line.  We need these in an 8 bit
         //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
         //   to the ICU u_getPropertyValueEnum() function.  Ugh.
         char  srcScriptName[20];
         char  targScriptName[20];
         extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
         extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
         UScriptCode srcScript  =
             static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
         UScriptCode targScript =
             static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
         if (U_FAILURE(status)) {
             goto cleanup;
         }
         if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
             status = U_INVALID_FORMAT_ERROR;
             goto cleanup;
         }

         // select the table - (A) any case or (L) lower case only
         UTrie2 *table = anyCaseTrie;
         if (uregex_start(parseRegexp, 7, &status) >= 0) {
             table = lowerCaseTrie;
         }

         // Build the set of scripts containing confusable characters for
         //   the code point(s) specified in this input line.
         // Sanity check that the script of the source code point is the same
         //   as the source script indicated in the input file.  Failure of this check is
         //   an error in the input file.
         // Include the source script in the set (needed for Mixed Script Confusable detection).
         //
         UChar32 cp;
         for (cp=startCodePoint; cp<=endCodePoint; cp++) {
             int32_t setIndex = utrie2_get32(table, cp);
             BuilderScriptSet *bsset = NULL;
             if (setIndex > 0) {
                 // An earlier line already created the set for this code point.
                 U_ASSERT(setIndex < scriptSets->size());
                 bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
             } else {
                 bsset = new BuilderScriptSet();
                 if (bsset == NULL) {
                     status = U_MEMORY_ALLOCATION_ERROR;
                     goto cleanup;
                 }
                 bsset->sset = new ScriptSet();
                 if (bsset->sset == NULL) {
                     // bsset is not in scriptSets yet, so cleanup would not free it.
                     delete bsset;
                     status = U_MEMORY_ALLOCATION_ERROR;
                     goto cleanup;
                 }
                 bsset->codePoint = cp;
                 bsset->trie = table;
                 setIndex = scriptSets->size();
                 bsset->index = setIndex;
                 bsset->rindex = 0;
                 scriptSets->addElement(bsset, status);
                 if (scriptSets->size() == setIndex) {
                     // The vector did not store the pointer, so it would leak at cleanup.
                     delete bsset;
                     if (U_SUCCESS(status)) {
                         status = U_MEMORY_ALLOCATION_ERROR;
                     }
                     goto cleanup;
                 }
                 utrie2_set32(table, cp, setIndex, &status);
             }
             bsset->sset->set(targScript, status);
             bsset->sset->set(srcScript, status);

             if (U_FAILURE(status)) {
                 goto cleanup;
             }
             UScriptCode cpScript = uscript_getScript(cp, &status);
             if (cpScript != srcScript) {
                 status = U_INVALID_FORMAT_ERROR;
                 goto cleanup;
             }
         }
     }

     // Eliminate duplicate script sets.  At this point we have a separate
     // script set for every code point that had data in the input file.
     //
     // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
     //
     // printf("Number of scriptSets: %d\n", scriptSets->size());
     {
         rtScriptSetsCount = 2;
         for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
             BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
             if (outerSet->index != static_cast<uint32_t>(outeri)) {
                 // This set was already identified as a duplicate.
                 //   It will not be allocated a position in the runtime array of ScriptSets.
                 continue;
             }
             outerSet->rindex = rtScriptSetsCount++;
             for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
                 BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
                 // Equal contents held in two distinct objects: make the inner wrapper
                 //   share the outer wrapper's ScriptSet and give up ownership.
                 if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
                     delete innerSet->sset;
                     innerSet->scriptSetOwned = FALSE;
                     innerSet->sset = outerSet->sset;
                     innerSet->index = outeri;
                     innerSet->rindex = outerSet->rindex;
                 }
                 // But this doesn't get all.  We need to fix the TRIE.
             }
         }
         // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
     }


     // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
     //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
     //     are unused, which is why the loop index starts at 2.)
     {
         for (int32_t i=2; i<scriptSets->size(); i++) {
             BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
             if (bSet->rindex != (uint32_t)i) {
                 utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
             }
         }
     }

     // For code points with script==Common or script==Inherited,
     //   Set the reserved value of 1 into both Tries.  These characters do not participate
     //   in Whole Script Confusable detection; this reserved value is the means
     //   by which they are detected.
     {
         UnicodeSet ignoreSet;
         ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
         UnicodeSet inheritedSet;
         inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
         ignoreSet.addAll(inheritedSet);
         for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
             UChar32 rangeStart = ignoreSet.getRangeStart(rn);
             UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
             utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
             utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
         }
     }

     // Serialize the data to the Spoof Detector
     {
         // A serialize call with a NULL buffer only measures; it is expected to report
         //   U_BUFFER_OVERFLOW_ERROR together with the required size.
         utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
         int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
         // printf("Any case Trie size: %d\n", size);
         if (status != U_BUFFER_OVERFLOW_ERROR) {
             goto cleanup;
         }
         status = U_ZERO_ERROR;
         spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
         spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
         // NOTE(review): fSpoofData->fAnyCaseTrie / fLowerCaseTrie temporarily point at the
         //   builder tries, which cleanup closes; they are replaced below on the success
         //   path.  Presumably this is needed by reserveSpace() - confirm against SpoofData
         //   before changing the order of these statements.
         spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
         void *where = spImpl->fSpoofData->reserveSpace(size, status);
         utrie2_serialize(anyCaseTrie, where, size, &status);

         utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
         size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
         // printf("Lower case Trie size: %d\n", size);
         if (status != U_BUFFER_OVERFLOW_ERROR) {
             goto cleanup;
         }
         status = U_ZERO_ERROR;
         spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
         spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
         spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
         where = spImpl->fSpoofData->reserveSpace(size, status);
         utrie2_serialize(lowerCaseTrie, where, size, &status);

         spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
         spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
         ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
             (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
         if (U_FAILURE(status) || rtScriptSets == NULL) {
             // Do not write through a pointer obtained from a failed reservation.
             if (U_SUCCESS(status)) {
                 status = U_MEMORY_ALLOCATION_ERROR;
             }
             goto cleanup;
         }
         uint32_t rindex = 2;
         for (int32_t i=2; i<scriptSets->size(); i++) {
             BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
             if (bSet->rindex < rindex) {
                 // We have already copied this script set to the serialized data.
                 continue;
             }
             U_ASSERT(rindex == bSet->rindex);
             rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
             rindex++;
         }
     }

     // Open new utrie2s from the serialized data.  We don't want to keep the ones
     //   we just built because we would then have two copies of the data, one internal to
     //   the utries that we have already constructed, and one in the serialized data area.
     //   An alternative would be to not pre-serialize the Trie data, but that makes the
     //   spoof detector data different, depending on how the detector was constructed.
     //   It's simpler to keep the data always the same.

     spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
             UTRIE2_16_VALUE_BITS,
             (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
             spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
             NULL,
             &status);

     // The lower case trie must be opened with its own serialized length,
     //   not the length of the any case trie.
     spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
             UTRIE2_16_VALUE_BITS,
             (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
             spImpl->fSpoofData->fRawData->fLowerCaseTrieLength,
             NULL,
             &status);


 cleanup:
     if (U_FAILURE(status) && pe != NULL) {
         pe->line = lineNum;
     }
     uregex_close(parseRegexp);
     uprv_free(input);

     int32_t i;
     if (scriptSets != NULL) {
         // The vector holds plain pointers; free each wrapper before the vector itself.
         for (i=0; i<scriptSets->size(); i++) {
             BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
             delete bsset;
         }
         delete scriptSets;
     }
     utrie2_close(anyCaseTrie);
     utrie2_close(lowerCaseTrie);
     return;
 }

 U_NAMESPACE_END


 // Construct a wrapper that is not yet tied to a code point, a trie or a ScriptSet.
 BuilderScriptSet::BuilderScriptSet() {
     // No ScriptSet attached yet.  By default the destructor deletes whatever
     // ScriptSet is attached later.
     sset = NULL;
     scriptSetOwned = TRUE;

     // Not yet associated with an input code point or with either trie.
     codePoint = -1;
     trie = NULL;

     // Build-time and run-time indexes both start at zero.
     index = 0;
     rindex = 0;
 }

 // Delete the wrapped ScriptSet, but only while this wrapper still owns it.
 // buildWSConfusableData() clears scriptSetOwned on wrappers that were switched
 // over to another wrapper's ScriptSet during duplicate elimination.
 BuilderScriptSet::~BuilderScriptSet() {
     if (!scriptSetOwned) {
         return;
     }
     delete sset;
 }

 #endif
 #endif //  !UCONFIG_NO_REGULAR_EXPRESSIONS
	/*
	******************************************************************************
	*
	* Copyright (C) 2008-2013, International Business Machines
	* Corporation and others. All Rights Reserved.
	*
	******************************************************************************
	* file name: uspoof_wsconf.cpp
	* encoding: US-ASCII
	* tab size: 8 (not used)
	* indentation:4
	*
	* created on: 2009Jan05 (refactoring earlier files)
	* created by: Andy Heninger
	*
	* Internal functions for compiling Whole Script confusable source data
	* into its binary (runtime) form. The binary data format is described
	* in uspoof_impl.h
	*/

	#include "unicode/utypes.h"
	#include "unicode/uspoof.h"

	#if !UCONFIG_NO_NORMALIZATION

	#if !UCONFIG_NO_REGULAR_EXPRESSIONS

	#if defined(STARBOARD)
	#include "starboard/client_porting/poem/assert_poem.h"
	#include "starboard/client_porting/poem/string_poem.h"
	#endif // defined(STARBOARD)
	#include "unicode/unorm.h"
	#include "unicode/uregex.h"
	#include "unicode/ustring.h"
	#include "cmemory.h"
	#include "scriptset.h"
	#include "uspoof_impl.h"
	#include "uhash.h"
	#include "uvector.h"
	#include "uassert.h"
	#include "uspoof_wsconf.h"

	U_NAMESPACE_USE


	// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
	// Example Lines:
	//   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
	//   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
	//    |               |     |    |
	//    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
	//    |               |     |----------Target script.   We need this.
	//    |               |----------------Src script.  Should match the script of the source
	//    |                                code points.  Beyond checking that, we don't keep it.
	//    |--------------------------------Source code points or range.
	//
	// The expression will match _all_ lines, including erroneous lines.
	// The result of the parse is returned via the contents of the (match) groups.
	// Pattern source for the line parser.  buildWSConfusableData() converts it to a
	// UnicodeString and compiles it with uregex_open(); the capture group numbers noted
	// on each line below are the ones that function queries.
	// The "*" quantifiers and plain "|" alternation bars are required: without them
	// optional whitespace and multi-character comments are not accepted.
	static const char *parseExp =
	        "(?m)"                                         // Multi-line mode
	        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
	        "|^(?:"                                        //   OR
	        "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
	        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
	        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
	        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
	        "[ \\t]*(?:#.*?)?"                             // Trailing comment
	        ")$|"                                          //   OR
	        "^(.*?)$";                                     // An error line.      Group 8.
	                                                       //    Any line not matching the preceding
	                                                       //    parts of the expression will match
	                                                       //    this, and thus be flagged as an error


	// Extract a regular expression match group into a char * string.
	// The group must contain only invariant characters.
	// Used for script names
	//
	static void extractGroup(
	URegularExpression e, int32_t group, char destBuf, int32_t destCapacity, UErrorCode &status) {

	UChar ubuf[50];
	ubuf[0] = 0;
	destBuf[0] = 0;
	int32_t len = uregex_group(e, group, ubuf, 50, &status);
	if (U_FAILURE(status) \|\| len == -1 \|\| len >= destCapacity) {
	return;
	}
	UnicodeString s(FALSE, ubuf, len); // Aliasing constructor
	s.extract(0, len, destBuf, destCapacity, US_INV);
	}



	U_NAMESPACE_BEGIN

	// Build the Whole Script Confusable data
	//
	// TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class,
	// because everything is local to this one build function anyhow,
	// OR
	// break this function into more reasonably sized pieces, with
	// state in WSConfusableDataBuilder.
	//
	void buildWSConfusableData(SpoofImpl spImpl, const char confusablesWS,
	int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
	{
	if (U_FAILURE(status)) {
	return;
	}
	URegularExpression *parseRegexp = NULL;
	int32_t inputLen = 0;
	UChar *input = NULL;
	int32_t lineNum = 0;

	UVector *scriptSets = NULL;
	uint32_t rtScriptSetsCount = 2;

	UTrie2 *anyCaseTrie = NULL;
	UTrie2 *lowerCaseTrie = NULL;

	anyCaseTrie = utrie2_open(0, 0, &status);
	lowerCaseTrie = utrie2_open(0, 0, &status);

	UnicodeString pattern(parseExp, -1, US_INV);

	// The scriptSets vector provides a mapping from TRIE values to the set of scripts.
	//
	// Reserved TRIE values:
	// 0: Code point has no whole script confusables.
	// 1: Code point is of script Common or Inherited.
	// These code points do not participate in whole script confusable detection.
	// (This is logically equivalent to saying that they contain confusables in
	// all scripts)
	//
	// Because Trie values are indexes into the ScriptSets vector, pre-fill
	// vector positions 0 and 1 to avoid conflicts with the reserved values.

	scriptSets = new UVector(status);
	if (scriptSets == NULL) {
	status = U_MEMORY_ALLOCATION_ERROR;
	goto cleanup;
	}
	scriptSets->addElement((void *)NULL, status);
	scriptSets->addElement((void *)NULL, status);

	// Convert the user input data from UTF-8 to UChar (UTF-16)
	u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
	if (status != U_BUFFER_OVERFLOW_ERROR) {
	goto cleanup;
	}
	status = U_ZERO_ERROR;
	input = static_cast<UChar >(uprv_malloc((inputLen+1) sizeof(UChar)));
	if (input == NULL) {
	status = U_MEMORY_ALLOCATION_ERROR;
	goto cleanup;
	}
	u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);

	parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);

	// Zap any Byte Order Mark at the start of input. Changing it to a space is benign
	// given the syntax of the input.
	if (*input == 0xfeff) {
	*input = 0x20;
	}

	// Parse the input, one line per iteration of this loop.
	uregex_setText(parseRegexp, input, inputLen, &status);
	while (uregex_findNext(parseRegexp, &status)) {
	lineNum++;
	if (uregex_start(parseRegexp, 1, &status) >= 0) {
	// this was a blank or comment line.
	continue;
	}
	if (uregex_start(parseRegexp, 8, &status) >= 0) {
	// input file syntax error.
	status = U_PARSE_ERROR;
	goto cleanup;
	}
	if (U_FAILURE(status)) {
	goto cleanup;
	}

	// Pick up the start and optional range end code points from the parsed line.
	UChar32 startCodePoint = SpoofImpl::ScanHex(
	input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
	UChar32 endCodePoint = startCodePoint;
	if (uregex_start(parseRegexp, 3, &status) >=0) {
	endCodePoint = SpoofImpl::ScanHex(
	input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
	}

	// Extract the two script names from the source line. We need these in an 8 bit
	// default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
	// to the ICU u_getPropertyValueEnum() function. Ugh.
	char srcScriptName[20];
	char targScriptName[20];
	extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
	extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
	UScriptCode srcScript =
	static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
	UScriptCode targScript =
	static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
	if (U_FAILURE(status)) {
	goto cleanup;
	}
	if (srcScript == USCRIPT_INVALID_CODE \|\| targScript == USCRIPT_INVALID_CODE) {
	status = U_INVALID_FORMAT_ERROR;
	goto cleanup;
	}

	// select the table - (A) any case or (L) lower case only
	UTrie2 *table = anyCaseTrie;
	if (uregex_start(parseRegexp, 7, &status) >= 0) {
	table = lowerCaseTrie;
	}

	// Build the set of scripts containing confusable characters for
	// the code point(s) specified in this input line.
	// Sanity check that the script of the source code point is the same
	// as the source script indicated in the input file. Failure of this check is
	// an error in the input file.
	// Include the source script in the set (needed for Mixed Script Confusable detection).
	//
	UChar32 cp;
	for (cp=startCodePoint; cp<=endCodePoint; cp++) {
	int32_t setIndex = utrie2_get32(table, cp);
	BuilderScriptSet *bsset = NULL;
	if (setIndex > 0) {
	U_ASSERT(setIndex < scriptSets->size());
	bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
	} else {
	bsset = new BuilderScriptSet();
	if (bsset == NULL) {
	status = U_MEMORY_ALLOCATION_ERROR;
	goto cleanup;
	}
	bsset->codePoint = cp;
	bsset->trie = table;
	bsset->sset = new ScriptSet();
	setIndex = scriptSets->size();
	bsset->index = setIndex;
	bsset->rindex = 0;
	if (bsset->sset == NULL) {
	status = U_MEMORY_ALLOCATION_ERROR;
	goto cleanup;
	}
	scriptSets->addElement(bsset, status);
	utrie2_set32(table, cp, setIndex, &status);
	}
	bsset->sset->set(targScript, status);
	bsset->sset->set(srcScript, status);

	if (U_FAILURE(status)) {
	goto cleanup;
	}
	UScriptCode cpScript = uscript_getScript(cp, &status);
	if (cpScript != srcScript) {
	status = U_INVALID_FORMAT_ERROR;
	goto cleanup;
	}
	}
	}

	// Eliminate duplicate script sets. At this point we have a separate
	// script set for every code point that had data in the input file.
	//
	// We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
	//
	// printf("Number of scriptSets: %d\n", scriptSets->size());
	{
	int32_t duplicateCount = 0;
	rtScriptSetsCount = 2;
	for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
	BuilderScriptSet outerSet = static_cast<BuilderScriptSet >(scriptSets->elementAt(outeri));
	if (outerSet->index != static_cast<uint32_t>(outeri)) {
	// This set was already identified as a duplicate.
	// It will not be allocated a position in the runtime array of ScriptSets.
	continue;
	}
	outerSet->rindex = rtScriptSetsCount++;
	for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
	BuilderScriptSet innerSet = static_cast<BuilderScriptSet >(scriptSets->elementAt(inneri));
	if ((outerSet->sset) == (innerSet->sset) && outerSet->sset != innerSet->sset) {
	delete innerSet->sset;
	innerSet->scriptSetOwned = FALSE;
	innerSet->sset = outerSet->sset;
	innerSet->index = outeri;
	innerSet->rindex = outerSet->rindex;
	duplicateCount++;
	}
	// But this doesn't get all. We need to fix the TRIE.
	}
	}
	// printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
	}



	// Update the Trie values to be reflect the run time script indexes (after duplicate merging).
	// (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
	// are unused, which is why the loop index starts at 2.)
	{
	for (int32_t i=2; i<scriptSets->size(); i++) {
	BuilderScriptSet bSet = static_cast<BuilderScriptSet >(scriptSets->elementAt(i));
	if (bSet->rindex != (uint32_t)i) {
	utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
	}
	}
	}

	// For code points with script==Common or script==Inherited,
	// Set the reserved value of 1 into both Tries. These characters do not participate
	// in Whole Script Confusable detection; this reserved value is the means
	// by which they are detected.
	{
	UnicodeSet ignoreSet;
	ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
	UnicodeSet inheritedSet;
	inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
	ignoreSet.addAll(inheritedSet);
	for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
	UChar32 rangeStart = ignoreSet.getRangeStart(rn);
	UChar32 rangeEnd = ignoreSet.getRangeEnd(rn);
	utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
	utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
	}
	}

	// Serialize the data to the Spoof Detector
	{
	utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status);
	int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
	// printf("Any case Trie size: %d\n", size);
	if (status != U_BUFFER_OVERFLOW_ERROR) {
	goto cleanup;
	}
	status = U_ZERO_ERROR;
	spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
	spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
	spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
	void *where = spImpl->fSpoofData->reserveSpace(size, status);
	utrie2_serialize(anyCaseTrie, where, size, &status);

	utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
	size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
	// printf("Lower case Trie size: %d\n", size);
	if (status != U_BUFFER_OVERFLOW_ERROR) {
	goto cleanup;
	}
	status = U_ZERO_ERROR;
	spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
	spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
	spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
	where = spImpl->fSpoofData->reserveSpace(size, status);
	utrie2_serialize(lowerCaseTrie, where, size, &status);

	spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
	spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
	ScriptSet rtScriptSets = static_cast<ScriptSet >
	(spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
	uint32_t rindex = 2;
	for (int32_t i=2; i<scriptSets->size(); i++) {
	BuilderScriptSet bSet = static_cast<BuilderScriptSet >(scriptSets->elementAt(i));
	if (bSet->rindex < rindex) {
	// We have already copied this script set to the serialized data.
	continue;
	}
	U_ASSERT(rindex == bSet->rindex);
	rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits.
	rindex++;
	}
	}

	// Open new utrie2s from the serialized data. We don't want to keep the ones
	// we just built because we would then have two copies of the data, one internal to
	// the utries that we have already constructed, and one in the serialized data area.
	// An alternative would be to not pre-serialize the Trie data, but that makes the
	// spoof detector data different, depending on how the detector was constructed.
	// It's simpler to keep the data always the same.

	spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
	UTRIE2_16_VALUE_BITS,
	(const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
	spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
	NULL,
	&status);

	spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
	UTRIE2_16_VALUE_BITS,
	(const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
	spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
	NULL,
	&status);



	cleanup:
	if (U_FAILURE(status)) {
	pe->line = lineNum;
	}
	uregex_close(parseRegexp);
	uprv_free(input);

	int32_t i;
	if (scriptSets != NULL) {
	for (i=0; i<scriptSets->size(); i++) {
	BuilderScriptSet bsset = static_cast<BuilderScriptSet >(scriptSets->elementAt(i));
	delete bsset;
	}
	delete scriptSets;
	}
	utrie2_close(anyCaseTrie);
	utrie2_close(lowerCaseTrie);
	return;
	}

	U_NAMESPACE_END



	// Construct a wrapper that is not yet tied to a code point, a trie or a ScriptSet.
	BuilderScriptSet::BuilderScriptSet() {
	    // No ScriptSet attached yet.  By default the destructor deletes whatever
	    // ScriptSet is attached later.
	    sset = NULL;
	    scriptSetOwned = TRUE;

	    // Not yet associated with an input code point or with either trie.
	    codePoint = -1;
	    trie = NULL;

	    // Build-time and run-time indexes both start at zero.
	    index = 0;
	    rindex = 0;
	}

	// Delete the wrapped ScriptSet, but only while this wrapper still owns it.
	// buildWSConfusableData() clears scriptSetOwned on wrappers that were switched
	// over to another wrapper's ScriptSet during duplicate elimination.
	BuilderScriptSet::~BuilderScriptSet() {
	    if (!scriptSetOwned) {
	        return;
	    }
	    delete sset;
	}

	#endif
	#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS