| // © 2019 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| |
| // locdistance.cpp |
| // created: 2019may08 Markus W. Scherer |
| |
| #include "unicode/utypes.h" |
| #include "unicode/bytestrie.h" |
| #include "unicode/localematcher.h" |
| #include "unicode/locid.h" |
| #include "unicode/uobject.h" |
| #include "unicode/ures.h" |
| #include "cstring.h" |
| #include "locdistance.h" |
| #include "loclikelysubtags.h" |
| #include "uassert.h" |
| #include "ucln_cmn.h" |
| #include "uinvchar.h" |
| #include "umutex.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| namespace { |
| |
| /** |
| * Bit flag used on the last character of a subtag in the trie. |
| * Must be set consistently by the builder and the lookup code. |
| */ |
| constexpr int32_t END_OF_SUBTAG = 0x80; |
| /** Distance value bit flag, set by the builder. */ |
| constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80; |
| /** Distance value bit flag, set by trieNext(). */ |
| constexpr int32_t DISTANCE_IS_FINAL = 0x100; |
| constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT; |
| |
| constexpr int32_t ABOVE_THRESHOLD = 100; |
| |
| // Indexes into array of distances. |
| enum { |
| IX_DEF_LANG_DISTANCE, |
| IX_DEF_SCRIPT_DISTANCE, |
| IX_DEF_REGION_DISTANCE, |
| IX_MIN_REGION_DISTANCE, |
| IX_LIMIT |
| }; |
| |
| LocaleDistance *gLocaleDistance = nullptr; |
| UInitOnce gInitOnce = U_INITONCE_INITIALIZER; |
| |
| UBool U_CALLCONV cleanup() { |
| delete gLocaleDistance; |
| gLocaleDistance = nullptr; |
| gInitOnce.reset(); |
| return TRUE; |
| } |
| |
| } // namespace |
| |
| void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) { |
| // This function is invoked only via umtx_initOnce(). |
| U_ASSERT(gLocaleDistance == nullptr); |
| const XLikelySubtags &likely = *XLikelySubtags::getSingleton(errorCode); |
| if (U_FAILURE(errorCode)) { return; } |
| const LocaleDistanceData &data = likely.getDistanceData(); |
| if (data.distanceTrieBytes == nullptr || |
| data.regionToPartitions == nullptr || data.partitions == nullptr || |
| // ok if no paradigms |
| data.distances == nullptr) { |
| errorCode = U_MISSING_RESOURCE_ERROR; |
| return; |
| } |
| gLocaleDistance = new LocaleDistance(data, likely); |
| if (gLocaleDistance == nullptr) { |
| errorCode = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup); |
| } |
| |
| const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) { |
| if (U_FAILURE(errorCode)) { return nullptr; } |
| umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode); |
| return gLocaleDistance; |
| } |
| |
| LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely) : |
| likelySubtags(likely), |
| trie(data.distanceTrieBytes), |
| regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions), |
| paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength), |
| defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]), |
| defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]), |
| defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]), |
| minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) { |
| // For the default demotion value, use the |
| // default region distance between unrelated Englishes. |
| // Thus, unless demotion is turned off, |
| // a mere region difference for one desired locale |
| // is as good as a perfect match for the next following desired locale. |
| // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>. |
| LSR en("en", "Latn", "US", LSR::EXPLICIT_LSR); |
| LSR enGB("en", "Latn", "GB", LSR::EXPLICIT_LSR); |
| const LSR *p_enGB = &enGB; |
| int32_t indexAndDistance = getBestIndexAndDistance(en, &p_enGB, 1, |
| shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE, ULOCMATCH_DIRECTION_WITH_ONE_WAY); |
| defaultDemotionPerDesiredLocale = getDistanceFloor(indexAndDistance); |
| } |
| |
| int32_t LocaleDistance::getBestIndexAndDistance( |
| const LSR &desired, |
| const LSR **supportedLSRs, int32_t supportedLSRsLength, |
| int32_t shiftedThreshold, |
| ULocMatchFavorSubtag favorSubtag, ULocMatchDirection direction) const { |
| BytesTrie iter(trie); |
| // Look up the desired language only once for all supported LSRs. |
| // Its "distance" is either a match point value of 0, or a non-match negative value. |
| // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. |
| int32_t desLangDistance = trieNext(iter, desired.language, false); |
| uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0; |
| // Index of the supported LSR with the lowest distance. |
| int32_t bestIndex = -1; |
| // Cached lookup info from XLikelySubtags.compareLikely(). |
| int32_t bestLikelyInfo = -1; |
| for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) { |
| const LSR &supported = *supportedLSRs[slIndex]; |
| bool star = false; |
| int32_t distance = desLangDistance; |
| if (distance >= 0) { |
| U_ASSERT((distance & DISTANCE_IS_FINAL) == 0); |
| if (slIndex != 0) { |
| iter.resetToState64(desLangState); |
| } |
| distance = trieNext(iter, supported.language, true); |
| } |
| // Note: The data builder verifies that there are no rules with "any" (*) language and |
| // real (non *) script or region subtags. |
| // This means that if the lookup for either language fails we can use |
| // the default distances without further lookups. |
| int32_t flags; |
| if (distance >= 0) { |
| flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; |
| distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; |
| } else { // <*, *> |
| if (uprv_strcmp(desired.language, supported.language) == 0) { |
| distance = 0; |
| } else { |
| distance = defaultLanguageDistance; |
| } |
| flags = 0; |
| star = true; |
| } |
| U_ASSERT(0 <= distance && distance <= 100); |
| // Round up the shifted threshold (if fraction bits are not 0) |
| // for comparison with un-shifted distances until we need fraction bits. |
| // (If we simply shifted non-zero fraction bits away, then we might ignore a language |
| // when it's really still a micro distance below the threshold.) |
| int32_t roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT; |
| // We implement "favor subtag" by reducing the language subtag distance |
| // (unscientifically reducing it to a quarter of the normal value), |
| // so that the script distance is relatively more important. |
| // For example, given a default language distance of 80, we reduce it to 20, |
| // which is below the default threshold of 50, which is the default script distance. |
| if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) { |
| distance >>= 2; |
| } |
| // Let distance == roundedThreshold pass until the tie-breaker logic |
| // at the end of the loop. |
| if (distance > roundedThreshold) { |
| continue; |
| } |
| |
| int32_t scriptDistance; |
| if (star || flags != 0) { |
| if (uprv_strcmp(desired.script, supported.script) == 0) { |
| scriptDistance = 0; |
| } else { |
| scriptDistance = defaultScriptDistance; |
| } |
| } else { |
| scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(), |
| desired.script, supported.script); |
| flags = scriptDistance & DISTANCE_IS_FINAL; |
| scriptDistance &= ~DISTANCE_IS_FINAL; |
| } |
| distance += scriptDistance; |
| if (distance > roundedThreshold) { |
| continue; |
| } |
| |
| if (uprv_strcmp(desired.region, supported.region) == 0) { |
| // regionDistance = 0 |
| } else if (star || (flags & DISTANCE_IS_FINAL) != 0) { |
| distance += defaultRegionDistance; |
| } else { |
| int32_t remainingThreshold = roundedThreshold - distance; |
| if (minRegionDistance > remainingThreshold) { |
| continue; |
| } |
| |
| // From here on we know the regions are not equal. |
| // Map each region to zero or more partitions. (zero = one non-matching string) |
| // (Each array of single-character partition strings is encoded as one string.) |
| // If either side has more than one, then we find the maximum distance. |
| // This could be optimized by adding some more structure, but probably not worth it. |
| distance += getRegionPartitionsDistance( |
| iter, iter.getState64(), |
| partitionsForRegion(desired), |
| partitionsForRegion(supported), |
| remainingThreshold); |
| } |
| int32_t shiftedDistance = shiftDistance(distance); |
| if (shiftedDistance == 0) { |
| // Distinguish between equivalent but originally unequal locales via an |
| // additional micro distance. |
| shiftedDistance |= (desired.flags ^ supported.flags); |
| if (shiftedDistance < shiftedThreshold) { |
| if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY || |
| // Is there also a match when we swap desired/supported? |
| isMatch(supported, desired, shiftedThreshold, favorSubtag)) { |
| if (shiftedDistance == 0) { |
| return slIndex << INDEX_SHIFT; |
| } |
| bestIndex = slIndex; |
| shiftedThreshold = shiftedDistance; |
| bestLikelyInfo = -1; |
| } |
| } |
| } else { |
| if (shiftedDistance < shiftedThreshold) { |
| if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY || |
| // Is there also a match when we swap desired/supported? |
| isMatch(supported, desired, shiftedThreshold, favorSubtag)) { |
| bestIndex = slIndex; |
| shiftedThreshold = shiftedDistance; |
| bestLikelyInfo = -1; |
| } |
| } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) { |
| if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY || |
| // Is there also a match when we swap desired/supported? |
| isMatch(supported, desired, shiftedThreshold, favorSubtag)) { |
| bestLikelyInfo = likelySubtags.compareLikely( |
| supported, *supportedLSRs[bestIndex], bestLikelyInfo); |
| if ((bestLikelyInfo & 1) != 0) { |
| // This supported locale matches as well as the previous best match, |
| // and neither matches perfectly, |
| // but this one is "more likely" (has more-default subtags). |
| bestIndex = slIndex; |
| } |
| } |
| } |
| } |
| } |
| return bestIndex >= 0 ? |
| (bestIndex << INDEX_SHIFT) | shiftedThreshold : |
| INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD); |
| } |
| |
| int32_t LocaleDistance::getDesSuppScriptDistance( |
| BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) { |
| // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. |
| int32_t distance = trieNext(iter, desired, false); |
| if (distance >= 0) { |
| distance = trieNext(iter, supported, true); |
| } |
| if (distance < 0) { |
| UStringTrieResult result = iter.resetToState64(startState).next(u'*'); // <*, *> |
| U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); |
| if (uprv_strcmp(desired, supported) == 0) { |
| distance = 0; // same script |
| } else { |
| distance = iter.getValue(); |
| U_ASSERT(distance >= 0); |
| } |
| if (result == USTRINGTRIE_FINAL_VALUE) { |
| distance |= DISTANCE_IS_FINAL; |
| } |
| } |
| return distance; |
| } |
| |
| int32_t LocaleDistance::getRegionPartitionsDistance( |
| BytesTrie &iter, uint64_t startState, |
| const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) { |
| char desired = *desiredPartitions++; |
| char supported = *supportedPartitions++; |
| U_ASSERT(desired != 0 && supported != 0); |
| // See if we have single desired/supported partitions, from NUL-terminated |
| // partition strings without explicit length. |
| bool suppLengthGt1 = *supportedPartitions != 0; // gt1: more than 1 character |
| // equivalent to: if (desLength == 1 && suppLength == 1) |
| if (*desiredPartitions == 0 && !suppLengthGt1) { |
| // Fastpath for single desired/supported partitions. |
| UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); |
| if (USTRINGTRIE_HAS_NEXT(result)) { |
| result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); |
| if (USTRINGTRIE_HAS_VALUE(result)) { |
| return iter.getValue(); |
| } |
| } |
| return getFallbackRegionDistance(iter, startState); |
| } |
| |
| const char *supportedStart = supportedPartitions - 1; // for restart of inner loop |
| int32_t regionDistance = 0; |
| // Fall back to * only once, not for each pair of partition strings. |
| bool star = false; |
| for (;;) { |
| // Look up each desired-partition string only once, |
| // not for each (desired, supported) pair. |
| UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); |
| if (USTRINGTRIE_HAS_NEXT(result)) { |
| uint64_t desState = suppLengthGt1 ? iter.getState64() : 0; |
| for (;;) { |
| result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); |
| int32_t d; |
| if (USTRINGTRIE_HAS_VALUE(result)) { |
| d = iter.getValue(); |
| } else if (star) { |
| d = 0; |
| } else { |
| d = getFallbackRegionDistance(iter, startState); |
| star = true; |
| } |
| if (d > threshold) { |
| return d; |
| } else if (regionDistance < d) { |
| regionDistance = d; |
| } |
| if ((supported = *supportedPartitions++) != 0) { |
| iter.resetToState64(desState); |
| } else { |
| break; |
| } |
| } |
| } else if (!star) { |
| int32_t d = getFallbackRegionDistance(iter, startState); |
| if (d > threshold) { |
| return d; |
| } else if (regionDistance < d) { |
| regionDistance = d; |
| } |
| star = true; |
| } |
| if ((desired = *desiredPartitions++) != 0) { |
| iter.resetToState64(startState); |
| supportedPartitions = supportedStart; |
| supported = *supportedPartitions++; |
| } else { |
| break; |
| } |
| } |
| return regionDistance; |
| } |
| |
| int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) { |
| #if U_DEBUG |
| UStringTrieResult result = |
| #endif |
| iter.resetToState64(startState).next(u'*'); // <*, *> |
| U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); |
| int32_t distance = iter.getValue(); |
| U_ASSERT(distance >= 0); |
| return distance; |
| } |
| |
| int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) { |
| uint8_t c; |
| if ((c = *s) == 0) { |
| return -1; // no empty subtags in the distance data |
| } |
| for (;;) { |
| c = uprv_invCharToAscii(c); |
| // EBCDIC: If *s is not an invariant character, |
| // then c is now 0 and will simply not match anything, which is harmless. |
| uint8_t next = *++s; |
| if (next != 0) { |
| if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) { |
| return -1; |
| } |
| } else { |
| // last character of this subtag |
| UStringTrieResult result = iter.next(c | END_OF_SUBTAG); |
| if (wantValue) { |
| if (USTRINGTRIE_HAS_VALUE(result)) { |
| int32_t value = iter.getValue(); |
| if (result == USTRINGTRIE_FINAL_VALUE) { |
| value |= DISTANCE_IS_FINAL; |
| } |
| return value; |
| } |
| } else { |
| if (USTRINGTRIE_HAS_NEXT(result)) { |
| return 0; |
| } |
| } |
| return -1; |
| } |
| c = next; |
| } |
| } |
| |
| UBool LocaleDistance::isParadigmLSR(const LSR &lsr) const { |
| // Linear search for a very short list (length 6 as of 2019), |
| // because we look for equivalence not equality, and |
| // because it's easy. |
| // If there are many paradigm LSRs we should use a hash set |
| // with custom comparator and hasher. |
| U_ASSERT(paradigmLSRsLength <= 15); |
| for (int32_t i = 0; i < paradigmLSRsLength; ++i) { |
| if (lsr.isEquivalentTo(paradigmLSRs[i])) { return true; } |
| } |
| return false; |
| } |
| |
| U_NAMESPACE_END |