| /* |
| ****************************************************************************** |
| * |
| * Copyright (C) 2008-2009, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ****************************************************************************** |
| * file name: uspoof_wsconf.cpp |
| * encoding: US-ASCII |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 2009Jan05 (refactoring earlier files) |
| * created by: Andy Heninger |
| * |
| * Internal functions for compiling Whole Script confusable source data |
| * into its binary (runtime) form. The binary data format is described |
| * in uspoof_impl.h |
| */ |
| |
| #include "unicode/utypes.h" |
| #include "unicode/uspoof.h" |
| |
| #if !UCONFIG_NO_NORMALIZATION |
| |
| #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| |
| #include "unicode/unorm.h" |
| #include "unicode/uregex.h" |
| #include "unicode/ustring.h" |
| #include "cmemory.h" |
| #include "uspoof_impl.h" |
| #include "uhash.h" |
| #include "uvector.h" |
| #include "uassert.h" |
| #include "uspoof_wsconf.h" |
| |
| U_NAMESPACE_USE |
| |
| |
// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
// Example Lines:
//   006F          ; Latn; Deva; A # (o) LATIN SMALL LETTER O
//   0048..0049    ; Latn; Grek; A #  [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
//    |               |     |    |
//    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
//    |               |     |----------Target script.  We need this.
//    |               |----------------Src script.  Should match the script of the source
//    |                                code points.  Beyond checking that, we don't keep it.
//    |--------------------------------Source code points or range.
//
// The expression will match _all_ lines, including erroneous lines.
// The result of the parse is returned via the contents of the (match) groups:
//    Group 1:     set for a blank or comment line.
//    Groups 2, 3: first code point and (optional) last code point of the range.
//    Group 4:     source script name.
//    Group 5:     target script name.
//    Group 6, 7:  table selector, 'A' (any case) or 'L' (lower case) respectively.
//    Group 8:     set for any line that matched none of the above, i.e. a syntax error.
//
// The range separator is matched as two literal dots ("\\.\\."); an unescaped ".."
// would accept any two characters between the code points.
static const char *parseExp =

        "(?m)"                                             // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                             // A blank or comment line.  Matches Group 1.
        "|^(?:"                                            //   OR
        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                             // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                             // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                                  // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                                 // Trailing comment
        ")$|"                                              //   OR
        "^(.*?)$";                                         // An error line.      Group 8.
                                                           //    Any line not matching the preceding
                                                           //    parts of the expression will match
                                                           //    this, and thus be flagged as an error
| |
| |
| // Extract a regular expression match group into a char * string. |
| // The group must contain only invariant characters. |
| // Used for script names |
| // |
| static void extractGroup( |
| URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) { |
| |
| UChar ubuf[50]; |
| ubuf[0] = 0; |
| destBuf[0] = 0; |
| int32_t len = uregex_group(e, group, ubuf, 50, &status); |
| if (U_FAILURE(status) || len == -1 || len >= destCapacity) { |
| return; |
| } |
| UnicodeString s(FALSE, ubuf, len); // Aliasing constructor |
| s.extract(0, len, destBuf, destCapacity, US_INV); |
| } |
| |
| |
| |
| // Build the Whole Script Confusable data |
| // |
| // TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class, |
| // because everything is local to this one build function anyhow, |
| // OR |
| // break this function into more reasonably sized pieces, with |
| // state in WSConfusableDataBuilder. |
| // |
| void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, |
| int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) |
| { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| URegularExpression *parseRegexp = NULL; |
| int32_t inputLen = 0; |
| UChar *input = NULL; |
| int32_t lineNum = 0; |
| |
| UVector *scriptSets = NULL; |
| uint32_t rtScriptSetsCount = 2; |
| |
| UTrie2 *anyCaseTrie = NULL; |
| UTrie2 *lowerCaseTrie = NULL; |
| |
| anyCaseTrie = utrie2_open(0, 0, &status); |
| lowerCaseTrie = utrie2_open(0, 0, &status); |
| |
| |
| // The scriptSets vector provides a mapping from TRIE values to the set of scripts. |
| // |
| // Reserved TRIE values: |
| // 0: Code point has no whole script confusables. |
| // 1: Code point is of script Common or Inherited. |
| // These code points do not participate in whole script confusable detection. |
| // (This is logically equivalent to saying that they contain confusables in |
| // all scripts) |
| // |
| // Because Trie values are indexes into the ScriptSets vector, pre-fill |
| // vector positions 0 and 1 to avoid conflicts with the reserved values. |
| |
| scriptSets = new UVector(status); |
| if (scriptSets == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| goto cleanup; |
| } |
| scriptSets->addElement((void *)NULL, status); |
| scriptSets->addElement((void *)NULL, status); |
| |
| // Convert the user input data from UTF-8 to UChar (UTF-16) |
| u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status); |
| if (status != U_BUFFER_OVERFLOW_ERROR) { |
| goto cleanup; |
| } |
| status = U_ZERO_ERROR; |
| input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); |
| if (input == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| goto cleanup; |
| } |
| u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status); |
| |
| |
| |
| parseRegexp = uregex_openC(parseExp, 0, NULL, &status); |
| |
| // Zap any Byte Order Mark at the start of input. Changing it to a space is benign |
| // given the syntax of the input. |
| if (*input == 0xfeff) { |
| *input = 0x20; |
| } |
| |
| // Parse the input, one line per iteration of this loop. |
| uregex_setText(parseRegexp, input, inputLen, &status); |
| while (uregex_findNext(parseRegexp, &status)) { |
| lineNum++; |
| UChar line[200]; |
| uregex_group(parseRegexp, 0, line, 200, &status); |
| if (uregex_start(parseRegexp, 1, &status) >= 0) { |
| // this was a blank or comment line. |
| continue; |
| } |
| if (uregex_start(parseRegexp, 8, &status) >= 0) { |
| // input file syntax error. |
| status = U_PARSE_ERROR; |
| goto cleanup; |
| } |
| if (U_FAILURE(status)) { |
| goto cleanup; |
| } |
| |
| // Pick up the start and optional range end code points from the parsed line. |
| UChar32 startCodePoint = SpoofImpl::ScanHex( |
| input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status); |
| UChar32 endCodePoint = startCodePoint; |
| if (uregex_start(parseRegexp, 3, &status) >=0) { |
| endCodePoint = SpoofImpl::ScanHex( |
| input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status); |
| } |
| |
| // Extract the two script names from the source line. We need these in an 8 bit |
| // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on |
| // to the ICU u_getPropertyValueEnum() function. Ugh. |
| char srcScriptName[20]; |
| char targScriptName[20]; |
| extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status); |
| extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status); |
| UScriptCode srcScript = |
| static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName)); |
| UScriptCode targScript = |
| static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName)); |
| if (U_FAILURE(status)) { |
| goto cleanup; |
| } |
| if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) { |
| status = U_INVALID_FORMAT_ERROR; |
| goto cleanup; |
| } |
| |
| // select the table - (A) any case or (L) lower case only |
| UTrie2 *table = anyCaseTrie; |
| if (uregex_start(parseRegexp, 7, &status) >= 0) { |
| table = lowerCaseTrie; |
| } |
| |
| // Build the set of scripts containing confusable characters for |
| // the code point(s) specified in this input line. |
| // Sanity check that the script of the source code point is the same |
| // as the source script indicated in the input file. Failure of this check is |
| // an error in the input file. |
| // Include the source script in the set (needed for Mixed Script Confusable detection). |
| // |
| UChar32 cp; |
| for (cp=startCodePoint; cp<=endCodePoint; cp++) { |
| int32_t setIndex = utrie2_get32(table, cp); |
| BuilderScriptSet *bsset = NULL; |
| if (setIndex > 0) { |
| U_ASSERT(setIndex < scriptSets->size()); |
| bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex)); |
| } else { |
| bsset = new BuilderScriptSet(); |
| if (bsset == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| goto cleanup; |
| } |
| bsset->codePoint = cp; |
| bsset->trie = table; |
| bsset->sset = new ScriptSet(); |
| setIndex = scriptSets->size(); |
| bsset->index = setIndex; |
| bsset->rindex = 0; |
| if (bsset->sset == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| goto cleanup; |
| } |
| scriptSets->addElement(bsset, status); |
| utrie2_set32(table, cp, setIndex, &status); |
| } |
| bsset->sset->Union(targScript); |
| bsset->sset->Union(srcScript); |
| |
| if (U_FAILURE(status)) { |
| goto cleanup; |
| } |
| UScriptCode cpScript = uscript_getScript(cp, &status); |
| if (cpScript != srcScript) { |
| status = U_INVALID_FORMAT_ERROR; |
| goto cleanup; |
| } |
| } |
| } |
| |
| // Eliminate duplicate script sets. At this point we have a separate |
| // script set for every code point that had data in the input file. |
| // |
| // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them |
| // |
| // printf("Number of scriptSets: %d\n", scriptSets->size()); |
| { |
| int32_t duplicateCount = 0; |
| rtScriptSetsCount = 2; |
| for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) { |
| BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri)); |
| if (outerSet->index != static_cast<uint32_t>(outeri)) { |
| // This set was already identified as a duplicate. |
| // It will not be allocated a position in the runtime array of ScriptSets. |
| continue; |
| } |
| outerSet->rindex = rtScriptSetsCount++; |
| for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) { |
| BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri)); |
| if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) { |
| delete innerSet->sset; |
| innerSet->scriptSetOwned = FALSE; |
| innerSet->sset = outerSet->sset; |
| innerSet->index = outeri; |
| innerSet->rindex = outerSet->rindex; |
| duplicateCount++; |
| } |
| // But this doesn't get all. We need to fix the TRIE. |
| } |
| } |
| // printf("Number of distinct script sets: %d\n", rtScriptSetsCount); |
| } |
| |
| |
| |
| // Update the Trie values to be reflect the run time script indexes (after duplicate merging). |
| // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets |
| // are unused, which is why the loop index starts at 2.) |
| { |
| for (int32_t i=2; i<scriptSets->size(); i++) { |
| BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); |
| if (bSet->rindex != (uint32_t)i) { |
| utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status); |
| } |
| } |
| } |
| |
| // For code points with script==Common or script==Inherited, |
| // Set the reserved value of 1 into both Tries. These characters do not participate |
| // in Whole Script Confusable detection; this reserved value is the means |
| // by which they are detected. |
| { |
| UnicodeSet ignoreSet; |
| ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); |
| UnicodeSet inheritedSet; |
| inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); |
| ignoreSet.addAll(inheritedSet); |
| for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) { |
| UChar32 rangeStart = ignoreSet.getRangeStart(rn); |
| UChar32 rangeEnd = ignoreSet.getRangeEnd(rn); |
| utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); |
| utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); |
| } |
| } |
| |
| // Serialize the data to the Spoof Detector |
| { |
| utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status); |
| int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status); |
| // printf("Any case Trie size: %d\n", size); |
| if (status != U_BUFFER_OVERFLOW_ERROR) { |
| goto cleanup; |
| } |
| status = U_ZERO_ERROR; |
| spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit; |
| spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size; |
| spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie; |
| void *where = spImpl->fSpoofData->reserveSpace(size, status); |
| utrie2_serialize(anyCaseTrie, where, size, &status); |
| |
| utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status); |
| size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status); |
| // printf("Lower case Trie size: %d\n", size); |
| if (status != U_BUFFER_OVERFLOW_ERROR) { |
| goto cleanup; |
| } |
| status = U_ZERO_ERROR; |
| spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit; |
| spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size; |
| spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie; |
| where = spImpl->fSpoofData->reserveSpace(size, status); |
| utrie2_serialize(lowerCaseTrie, where, size, &status); |
| |
| spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit; |
| spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount; |
| ScriptSet *rtScriptSets = static_cast<ScriptSet *> |
| (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status)); |
| uint32_t rindex = 2; |
| for (int32_t i=2; i<scriptSets->size(); i++) { |
| BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); |
| if (bSet->rindex < rindex) { |
| // We have already copied this script set to the serialized data. |
| continue; |
| } |
| U_ASSERT(rindex == bSet->rindex); |
| rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits. |
| rindex++; |
| } |
| } |
| |
| // Open new utrie2s from the serialized data. We don't want to keep the ones |
| // we just built because we would then have two copies of the data, one internal to |
| // the utries that we have already constructed, and one in the serialized data area. |
| // An alternative would be to not pre-serialize the Trie data, but that makes the |
| // spoof detector data different, depending on how the detector was constructed. |
| // It's simpler to keep the data always the same. |
| |
| spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized( |
| UTRIE2_16_VALUE_BITS, |
| (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie, |
| spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, |
| NULL, |
| &status); |
| |
| spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized( |
| UTRIE2_16_VALUE_BITS, |
| (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie, |
| spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, |
| NULL, |
| &status); |
| |
| |
| |
| cleanup: |
| if (U_FAILURE(status)) { |
| pe->line = lineNum; |
| } |
| uregex_close(parseRegexp); |
| uprv_free(input); |
| |
| int32_t i; |
| for (i=0; i<scriptSets->size(); i++) { |
| BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); |
| delete bsset; |
| } |
| delete scriptSets; |
| utrie2_close(anyCaseTrie); |
| utrie2_close(lowerCaseTrie); |
| return; |
| } |
| |
| |
| |
| |
| |
| BuilderScriptSet::BuilderScriptSet() { |
| codePoint = -1; |
| trie = NULL; |
| sset = NULL; |
| index = 0; |
| rindex = 0; |
| scriptSetOwned = TRUE; |
| } |
| |
| BuilderScriptSet::~BuilderScriptSet() { |
| if (scriptSetOwned) { |
| delete sset; |
| } |
| } |
| |
| #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
| #endif // !UCONFIG_NO_NORMALIZATION |
| |