| /* |
| ********************************************************************** |
| * Copyright (C) 2012-2014, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| */ |
| |
| #include "starboard/client_porting/poem/string_poem.h" |
| #include "unicode/utypes.h" |
| |
| #include "unicode/uchar.h" |
| #include "unicode/utf16.h" |
| |
| #include "identifier_info.h" |
| #include "mutex.h" |
| #include "scriptset.h" |
| #include "ucln_in.h" |
| #include "uvector.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| static UnicodeSet *ASCII; |
| static ScriptSet *JAPANESE; |
| static ScriptSet *CHINESE; |
| static ScriptSet *KOREAN; |
| static ScriptSet *CONFUSABLE_WITH_LATIN; |
| static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER; |
| |
| |
| U_CDECL_BEGIN |
| static UBool U_CALLCONV |
| IdentifierInfo_cleanup(void) { |
| delete ASCII; |
| ASCII = NULL; |
| delete JAPANESE; |
| JAPANESE = NULL; |
| delete CHINESE; |
| CHINESE = NULL; |
| delete KOREAN; |
| KOREAN = NULL; |
| delete CONFUSABLE_WITH_LATIN; |
| CONFUSABLE_WITH_LATIN = NULL; |
| gIdentifierInfoInitOnce.reset(); |
| return TRUE; |
| } |
| |
| static void U_CALLCONV |
| IdentifierInfo_init(UErrorCode &status) { |
| ASCII = new UnicodeSet(0, 0x7f); |
| JAPANESE = new ScriptSet(); |
| CHINESE = new ScriptSet(); |
| KOREAN = new ScriptSet(); |
| CONFUSABLE_WITH_LATIN = new ScriptSet(); |
| if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL |
| || CONFUSABLE_WITH_LATIN == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| ASCII->freeze(); |
| JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status) |
| .set(USCRIPT_KATAKANA, status); |
| CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status); |
| KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status); |
| CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status) |
| .set(USCRIPT_CHEROKEE, status); |
| ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup); |
| } |
| U_CDECL_END |
| |
| |
| IdentifierInfo::IdentifierInfo(UErrorCode &status): |
| fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), |
| fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) { |
| umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| fIdentifier = new UnicodeString(); |
| fRequiredScripts = new ScriptSet(); |
| fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status); |
| uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet); |
| fCommonAmongAlternates = new ScriptSet(); |
| fNumerics = new UnicodeSet(); |
| fIdentifierProfile = new UnicodeSet(0, 0x10FFFF); |
| |
| if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL || |
| fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| } |
| |
| IdentifierInfo::~IdentifierInfo() { |
| delete fIdentifier; |
| delete fRequiredScripts; |
| uhash_close(fScriptSetSet); |
| delete fCommonAmongAlternates; |
| delete fNumerics; |
| delete fIdentifierProfile; |
| } |
| |
| |
| IdentifierInfo &IdentifierInfo::clear() { |
| fRequiredScripts->resetAll(); |
| uhash_removeAll(fScriptSetSet); |
| fNumerics->clear(); |
| fCommonAmongAlternates->resetAll(); |
| return *this; |
| } |
| |
| |
| IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) { |
| *fIdentifierProfile = identifierProfile; |
| return *this; |
| } |
| |
| |
| const UnicodeSet &IdentifierInfo::getIdentifierProfile() const { |
| return *fIdentifierProfile; |
| } |
| |
| |
| IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| *fIdentifier = identifier; |
| clear(); |
| ScriptSet scriptsForCP; |
| UChar32 cp; |
| for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) { |
| cp = identifier.char32At(i); |
| // Store a representative character for each kind of decimal digit |
| if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) { |
| // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value |
| fNumerics->add(cp - (UChar32)u_getNumericValue(cp)); |
| } |
| UScriptCode extensions[500]; |
| int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, UPRV_LENGTHOF(extensions), &status); |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| scriptsForCP.resetAll(); |
| for (int32_t j=0; j<extensionsCount; j++) { |
| scriptsForCP.set(extensions[j], status); |
| } |
| scriptsForCP.reset(USCRIPT_COMMON, status); |
| scriptsForCP.reset(USCRIPT_INHERITED, status); |
| switch (scriptsForCP.countMembers()) { |
| case 0: break; |
| case 1: |
| // Single script, record it. |
| fRequiredScripts->Union(scriptsForCP); |
| break; |
| default: |
| if (!fRequiredScripts->intersects(scriptsForCP) |
| && !uhash_geti(fScriptSetSet, &scriptsForCP)) { |
| // If the set hasn't been added already, add it |
| // (Add a copy, fScriptSetSet takes ownership of the copy.) |
| uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status); |
| } |
| break; |
| } |
| } |
| // Now make a final pass through ScriptSetSet to remove alternates that came before singles. |
| // [Kana], [Kana Hira] => [Kana] |
| // This is relatively infrequent, so doesn't have to be optimized. |
| // We also compute any commonalities among the alternates. |
| if (uhash_count(fScriptSetSet) > 0) { |
| fCommonAmongAlternates->setAll(); |
| for (int32_t it = UHASH_FIRST;;) { |
| const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it); |
| if (nextHashEl == NULL) { |
| break; |
| } |
| ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer); |
| // [Kana], [Kana Hira] => [Kana] |
| if (fRequiredScripts->intersects(*next)) { |
| uhash_removeElement(fScriptSetSet, nextHashEl); |
| } else { |
| fCommonAmongAlternates->intersect(*next); |
| // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]] |
| for (int32_t otherIt = UHASH_FIRST;;) { |
| const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt); |
| if (otherHashEl == NULL) { |
| break; |
| } |
| ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer); |
| if (next != other && next->contains(*other)) { |
| uhash_removeElement(fScriptSetSet, nextHashEl); |
| break; |
| } |
| } |
| } |
| } |
| } |
| if (uhash_count(fScriptSetSet) == 0) { |
| fCommonAmongAlternates->resetAll(); |
| } |
| return *this; |
| } |
| |
| |
| const UnicodeString *IdentifierInfo::getIdentifier() const { |
| return fIdentifier; |
| } |
| |
| const ScriptSet *IdentifierInfo::getScripts() const { |
| return fRequiredScripts; |
| } |
| |
| const UHashtable *IdentifierInfo::getAlternates() const { |
| return fScriptSetSet; |
| } |
| |
| |
| const UnicodeSet *IdentifierInfo::getNumerics() const { |
| return fNumerics; |
| } |
| |
| const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const { |
| return fCommonAmongAlternates; |
| } |
| |
| #if !UCONFIG_NO_NORMALIZATION |
| |
| URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const { |
| if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) { |
| return USPOOF_UNRESTRICTIVE; |
| } |
| if (ASCII->containsAll(*fIdentifier)) { |
| return USPOOF_ASCII; |
| } |
| // This is a bit tricky. We look at a number of factors. |
| // The number of scripts in the text. |
| // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc]) |
| // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.) |
| |
| // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the |
| // time it is created, in setIdentifier(). |
| int32_t cardinalityPlus = fRequiredScripts->countMembers() + |
| (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); |
| if (cardinalityPlus < 2) { |
| return USPOOF_SINGLE_SCRIPT_RESTRICTIVE; |
| } |
| if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts) |
| || containsWithAlternates(*KOREAN, *fRequiredScripts)) { |
| return USPOOF_HIGHLY_RESTRICTIVE; |
| } |
| if (cardinalityPlus == 2 && |
| fRequiredScripts->test(USCRIPT_LATIN, status) && |
| !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) { |
| return USPOOF_MODERATELY_RESTRICTIVE; |
| } |
| return USPOOF_MINIMALLY_RESTRICTIVE; |
| } |
| |
| #endif /* !UCONFIG_NO_NORMALIZATION */ |
| |
| int32_t IdentifierInfo::getScriptCount() const { |
| // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts. |
| int32_t count = fRequiredScripts->countMembers() + |
| (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); |
| return count; |
| } |
| |
| |
| |
| UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const { |
| if (!container.contains(containee)) { |
| return FALSE; |
| } |
| for (int32_t iter = UHASH_FIRST; ;) { |
| const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter); |
| if (hashEl == NULL) { |
| break; |
| } |
| ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer); |
| if (!container.intersects(*alternatives)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) { |
| UVector sorted(status); |
| if (U_FAILURE(status)) { |
| return dest; |
| } |
| for (int32_t pos = UHASH_FIRST; ;) { |
| const UHashElement *el = uhash_nextElement(alternates, &pos); |
| if (el == NULL) { |
| break; |
| } |
| ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer); |
| sorted.addElement(ss, status); |
| } |
| sorted.sort(uhash_compareScriptSet, status); |
| UnicodeString separator = UNICODE_STRING_SIMPLE("; "); |
| for (int32_t i=0; i<sorted.size(); i++) { |
| if (i>0) { |
| dest.append(separator); |
| } |
| ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i)); |
| ss->displayScripts(dest); |
| } |
| return dest; |
| } |
| |
| U_NAMESPACE_END |
| |