| // © 2018 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_FORMATTING |
| |
| // Allow implicit conversion from char16_t* to UnicodeString for this file: |
| // Helpful in toString methods and elsewhere. |
| #define UNISTR_FROM_STRING_EXPLICIT |
| |
| #include "numparse_types.h" |
| #include "numparse_affixes.h" |
| #include "numparse_utils.h" |
| #include "number_utils.h" |
| #include "string_segment.h" |
| |
| using namespace icu; |
| using namespace icu::numparse; |
| using namespace icu::numparse::impl; |
| using namespace icu::number; |
| using namespace icu::number::impl; |
| |
| |
| namespace { |
| |
| /** |
| * Helper method to return whether the given AffixPatternMatcher equals the given pattern string. |
| * Either both arguments must be null or the pattern string inside the AffixPatternMatcher must equal |
| * the given pattern string. |
| */ |
| static bool matched(const AffixPatternMatcher* affix, const UnicodeString& patternString) { |
| return (affix == nullptr && patternString.isBogus()) || |
| (affix != nullptr && affix->getPattern() == patternString); |
| } |
| |
| /** |
| * Helper method to return the length of the given AffixPatternMatcher. Returns 0 for null. |
| */ |
| static int32_t length(const AffixPatternMatcher* matcher) { |
| return matcher == nullptr ? 0 : matcher->getPattern().length(); |
| } |
| |
| /** |
| * Helper method to return whether (1) both lhs and rhs are null/invalid, or (2) if they are both |
| * valid, whether they are equal according to operator==. Similar to Java Objects.equals() |
| */ |
| static bool equals(const AffixPatternMatcher* lhs, const AffixPatternMatcher* rhs) { |
| if (lhs == nullptr && rhs == nullptr) { |
| return true; |
| } |
| if (lhs == nullptr || rhs == nullptr) { |
| return false; |
| } |
| return *lhs == *rhs; |
| } |
| |
| } |
| |
| |
| AffixPatternMatcherBuilder::AffixPatternMatcherBuilder(const UnicodeString& pattern, |
| AffixTokenMatcherWarehouse& warehouse, |
| IgnorablesMatcher* ignorables) |
| : fMatchersLen(0), |
| fLastTypeOrCp(0), |
| fPattern(pattern), |
| fWarehouse(warehouse), |
| fIgnorables(ignorables) {} |
| |
| void AffixPatternMatcherBuilder::consumeToken(AffixPatternType type, UChar32 cp, UErrorCode& status) { |
| // This is called by AffixUtils.iterateWithConsumer() for each token. |
| |
| // Add an ignorables matcher between tokens except between two literals, and don't put two |
| // ignorables matchers in a row. |
| if (fIgnorables != nullptr && fMatchersLen > 0 && |
| (fLastTypeOrCp < 0 || !fIgnorables->getSet()->contains(fLastTypeOrCp))) { |
| addMatcher(*fIgnorables); |
| } |
| |
| if (type != TYPE_CODEPOINT) { |
| // Case 1: the token is a symbol. |
| switch (type) { |
| case TYPE_MINUS_SIGN: |
| addMatcher(fWarehouse.minusSign()); |
| break; |
| case TYPE_PLUS_SIGN: |
| addMatcher(fWarehouse.plusSign()); |
| break; |
| case TYPE_PERCENT: |
| addMatcher(fWarehouse.percent()); |
| break; |
| case TYPE_PERMILLE: |
| addMatcher(fWarehouse.permille()); |
| break; |
| case TYPE_CURRENCY_SINGLE: |
| case TYPE_CURRENCY_DOUBLE: |
| case TYPE_CURRENCY_TRIPLE: |
| case TYPE_CURRENCY_QUAD: |
| case TYPE_CURRENCY_QUINT: |
| // All currency symbols use the same matcher |
| addMatcher(fWarehouse.currency(status)); |
| break; |
| default: |
| UPRV_UNREACHABLE; |
| } |
| |
| } else if (fIgnorables != nullptr && fIgnorables->getSet()->contains(cp)) { |
| // Case 2: the token is an ignorable literal. |
| // No action necessary: the ignorables matcher has already been added. |
| |
| } else { |
| // Case 3: the token is a non-ignorable literal. |
| if (auto* ptr = fWarehouse.nextCodePointMatcher(cp, status)) { |
| addMatcher(*ptr); |
| } else { |
| // OOM; unwind the stack |
| return; |
| } |
| } |
| fLastTypeOrCp = type != TYPE_CODEPOINT ? type : cp; |
| } |
| |
| void AffixPatternMatcherBuilder::addMatcher(NumberParseMatcher& matcher) { |
| if (fMatchersLen >= fMatchers.getCapacity()) { |
| fMatchers.resize(fMatchersLen * 2, fMatchersLen); |
| } |
| fMatchers[fMatchersLen++] = &matcher; |
| } |
| |
| AffixPatternMatcher AffixPatternMatcherBuilder::build(UErrorCode& status) { |
| return AffixPatternMatcher(fMatchers, fMatchersLen, fPattern, status); |
| } |
| |
| AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData* setupData) |
| : fSetupData(setupData) {} |
| |
| NumberParseMatcher& AffixTokenMatcherWarehouse::minusSign() { |
| return fMinusSign = {fSetupData->dfs, true}; |
| } |
| |
| NumberParseMatcher& AffixTokenMatcherWarehouse::plusSign() { |
| return fPlusSign = {fSetupData->dfs, true}; |
| } |
| |
| NumberParseMatcher& AffixTokenMatcherWarehouse::percent() { |
| return fPercent = {fSetupData->dfs}; |
| } |
| |
| NumberParseMatcher& AffixTokenMatcherWarehouse::permille() { |
| return fPermille = {fSetupData->dfs}; |
| } |
| |
| NumberParseMatcher& AffixTokenMatcherWarehouse::currency(UErrorCode& status) { |
| return fCurrency = {fSetupData->currencySymbols, fSetupData->dfs, fSetupData->parseFlags, status}; |
| } |
| |
| IgnorablesMatcher& AffixTokenMatcherWarehouse::ignorables() { |
| return fSetupData->ignorables; |
| } |
| |
| NumberParseMatcher* AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp, UErrorCode& status) { |
| if (U_FAILURE(status)) { |
| return nullptr; |
| } |
| auto* result = fCodePoints.create(cp); |
| if (result == nullptr) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| return result; |
| } |
| |
| |
| CodePointMatcher::CodePointMatcher(UChar32 cp) |
| : fCp(cp) {} |
| |
| bool CodePointMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const { |
| if (segment.startsWith(fCp)) { |
| segment.adjustOffsetByCodePoint(); |
| result.setCharsConsumed(segment); |
| } |
| return false; |
| } |
| |
| bool CodePointMatcher::smokeTest(const StringSegment& segment) const { |
| return segment.startsWith(fCp); |
| } |
| |
| UnicodeString CodePointMatcher::toString() const { |
| return u"<CodePoint>"; |
| } |
| |
| |
| AffixPatternMatcher AffixPatternMatcher::fromAffixPattern(const UnicodeString& affixPattern, |
| AffixTokenMatcherWarehouse& tokenWarehouse, |
| parse_flags_t parseFlags, bool* success, |
| UErrorCode& status) { |
| if (affixPattern.isEmpty()) { |
| *success = false; |
| return {}; |
| } |
| *success = true; |
| |
| IgnorablesMatcher* ignorables; |
| if (0 != (parseFlags & PARSE_FLAG_EXACT_AFFIX)) { |
| ignorables = nullptr; |
| } else { |
| ignorables = &tokenWarehouse.ignorables(); |
| } |
| |
| AffixPatternMatcherBuilder builder(affixPattern, tokenWarehouse, ignorables); |
| AffixUtils::iterateWithConsumer(affixPattern, builder, status); |
| return builder.build(status); |
| } |
| |
| AffixPatternMatcher::AffixPatternMatcher(MatcherArray& matchers, int32_t matchersLen, |
| const UnicodeString& pattern, UErrorCode& status) |
| : ArraySeriesMatcher(matchers, matchersLen), fPattern(pattern, status) { |
| } |
| |
| UnicodeString AffixPatternMatcher::getPattern() const { |
| return fPattern.toAliasedUnicodeString(); |
| } |
| |
| bool AffixPatternMatcher::operator==(const AffixPatternMatcher& other) const { |
| return fPattern == other.fPattern; |
| } |
| |
| |
| AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse* tokenWarehouse) |
| : fTokenWarehouse(tokenWarehouse) { |
| } |
| |
| bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider& patternInfo, |
| const IgnorablesMatcher& ignorables, parse_flags_t parseFlags, |
| UErrorCode& status) { |
| UnicodeString posPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_PREFIX); |
| UnicodeString posSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_SUFFIX); |
| UnicodeString negPrefixString; |
| UnicodeString negSuffixString; |
| if (patternInfo.hasNegativeSubpattern()) { |
| negPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_PREFIX); |
| negSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_SUFFIX); |
| } |
| |
| if (0 == (parseFlags & PARSE_FLAG_USE_FULL_AFFIXES) && |
| AffixUtils::containsOnlySymbolsAndIgnorables(posPrefixString, *ignorables.getSet(), status) && |
| AffixUtils::containsOnlySymbolsAndIgnorables(posSuffixString, *ignorables.getSet(), status) && |
| AffixUtils::containsOnlySymbolsAndIgnorables(negPrefixString, *ignorables.getSet(), status) && |
| AffixUtils::containsOnlySymbolsAndIgnorables(negSuffixString, *ignorables.getSet(), status) |
| // HACK: Plus and minus sign are a special case: we accept them trailing only if they are |
| // trailing in the pattern string. |
| && !AffixUtils::containsType(posSuffixString, TYPE_PLUS_SIGN, status) && |
| !AffixUtils::containsType(posSuffixString, TYPE_MINUS_SIGN, status) && |
| !AffixUtils::containsType(negSuffixString, TYPE_PLUS_SIGN, status) && |
| !AffixUtils::containsType(negSuffixString, TYPE_MINUS_SIGN, status)) { |
| // The affixes contain only symbols and ignorables. |
| // No need to generate affix matchers. |
| return false; |
| } |
| return true; |
| } |
| |
| void AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider& patternInfo, |
| MutableMatcherCollection& output, |
| const IgnorablesMatcher& ignorables, |
| parse_flags_t parseFlags, UErrorCode& status) { |
| if (!isInteresting(patternInfo, ignorables, parseFlags, status)) { |
| return; |
| } |
| |
| // The affixes have interesting characters, or we are in strict mode. |
| // Use initial capacity of 6, the highest possible number of AffixMatchers. |
| UnicodeString sb; |
| bool includeUnpaired = 0 != (parseFlags & PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES); |
| |
| int32_t numAffixMatchers = 0; |
| int32_t numAffixPatternMatchers = 0; |
| |
| AffixPatternMatcher* posPrefix = nullptr; |
| AffixPatternMatcher* posSuffix = nullptr; |
| |
| // Pre-process the affix strings to resolve LDML rules like sign display. |
| for (int8_t typeInt = 0; typeInt < PATTERN_SIGN_TYPE_COUNT; typeInt++) { |
| auto type = static_cast<PatternSignType>(typeInt); |
| |
| // Skip affixes in some cases |
| if (type == PATTERN_SIGN_TYPE_POS |
| && 0 != (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) { |
| continue; |
| } |
| if (type == PATTERN_SIGN_TYPE_POS_SIGN |
| && 0 == (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) { |
| continue; |
| } |
| |
| // Generate Prefix |
| bool hasPrefix = false; |
| PatternStringUtils::patternInfoToStringBuilder( |
| patternInfo, true, type, StandardPlural::OTHER, false, sb); |
| fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern( |
| sb, *fTokenWarehouse, parseFlags, &hasPrefix, status); |
| AffixPatternMatcher* prefix = hasPrefix ? &fAffixPatternMatchers[numAffixPatternMatchers++] |
| : nullptr; |
| |
| // Generate Suffix |
| bool hasSuffix = false; |
| PatternStringUtils::patternInfoToStringBuilder( |
| patternInfo, false, type, StandardPlural::OTHER, false, sb); |
| fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern( |
| sb, *fTokenWarehouse, parseFlags, &hasSuffix, status); |
| AffixPatternMatcher* suffix = hasSuffix ? &fAffixPatternMatchers[numAffixPatternMatchers++] |
| : nullptr; |
| |
| if (type == PATTERN_SIGN_TYPE_POS) { |
| posPrefix = prefix; |
| posSuffix = suffix; |
| } else if (equals(prefix, posPrefix) && equals(suffix, posSuffix)) { |
| // Skip adding these matchers (we already have equivalents) |
| continue; |
| } |
| |
| // Flags for setting in the ParsedNumber; the token matchers may add more. |
| int flags = (type == PATTERN_SIGN_TYPE_NEG) ? FLAG_NEGATIVE : 0; |
| |
| // Note: it is indeed possible for posPrefix and posSuffix to both be null. |
| // We still need to add that matcher for strict mode to work. |
| fAffixMatchers[numAffixMatchers++] = {prefix, suffix, flags}; |
| if (includeUnpaired && prefix != nullptr && suffix != nullptr) { |
| // The following if statements are designed to prevent adding two identical matchers. |
| if (type == PATTERN_SIGN_TYPE_POS || !equals(prefix, posPrefix)) { |
| fAffixMatchers[numAffixMatchers++] = {prefix, nullptr, flags}; |
| } |
| if (type == PATTERN_SIGN_TYPE_POS || !equals(suffix, posSuffix)) { |
| fAffixMatchers[numAffixMatchers++] = {nullptr, suffix, flags}; |
| } |
| } |
| } |
| |
| // Put the AffixMatchers in order, and then add them to the output. |
| // Since there are at most 9 elements, do a simple-to-implement bubble sort. |
| bool madeChanges; |
| do { |
| madeChanges = false; |
| for (int32_t i = 1; i < numAffixMatchers; i++) { |
| if (fAffixMatchers[i - 1].compareTo(fAffixMatchers[i]) > 0) { |
| madeChanges = true; |
| AffixMatcher temp = std::move(fAffixMatchers[i - 1]); |
| fAffixMatchers[i - 1] = std::move(fAffixMatchers[i]); |
| fAffixMatchers[i] = std::move(temp); |
| } |
| } |
| } while (madeChanges); |
| |
| for (int32_t i = 0; i < numAffixMatchers; i++) { |
| // Enable the following line to debug affixes |
| //std::cout << "Adding affix matcher: " << CStr(fAffixMatchers[i].toString())() << std::endl; |
| output.addMatcher(fAffixMatchers[i]); |
| } |
| } |
| |
| |
| AffixMatcher::AffixMatcher(AffixPatternMatcher* prefix, AffixPatternMatcher* suffix, result_flags_t flags) |
| : fPrefix(prefix), fSuffix(suffix), fFlags(flags) {} |
| |
| bool AffixMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const { |
| if (!result.seenNumber()) { |
| // Prefix |
| // Do not match if: |
| // 1. We have already seen a prefix (result.prefix != null) |
| // 2. The prefix in this AffixMatcher is empty (prefix == null) |
| if (!result.prefix.isBogus() || fPrefix == nullptr) { |
| return false; |
| } |
| |
| // Attempt to match the prefix. |
| int initialOffset = segment.getOffset(); |
| bool maybeMore = fPrefix->match(segment, result, status); |
| if (initialOffset != segment.getOffset()) { |
| result.prefix = fPrefix->getPattern(); |
| } |
| return maybeMore; |
| |
| } else { |
| // Suffix |
| // Do not match if: |
| // 1. We have already seen a suffix (result.suffix != null) |
| // 2. The suffix in this AffixMatcher is empty (suffix == null) |
| // 3. The matched prefix does not equal this AffixMatcher's prefix |
| if (!result.suffix.isBogus() || fSuffix == nullptr || !matched(fPrefix, result.prefix)) { |
| return false; |
| } |
| |
| // Attempt to match the suffix. |
| int initialOffset = segment.getOffset(); |
| bool maybeMore = fSuffix->match(segment, result, status); |
| if (initialOffset != segment.getOffset()) { |
| result.suffix = fSuffix->getPattern(); |
| } |
| return maybeMore; |
| } |
| } |
| |
| bool AffixMatcher::smokeTest(const StringSegment& segment) const { |
| return (fPrefix != nullptr && fPrefix->smokeTest(segment)) || |
| (fSuffix != nullptr && fSuffix->smokeTest(segment)); |
| } |
| |
| void AffixMatcher::postProcess(ParsedNumber& result) const { |
| // Check to see if our affix is the one that was matched. If so, set the flags in the result. |
| if (matched(fPrefix, result.prefix) && matched(fSuffix, result.suffix)) { |
| // Fill in the result prefix and suffix with non-null values (empty string). |
| // Used by strict mode to determine whether an entire affix pair was matched. |
| if (result.prefix.isBogus()) { |
| result.prefix = UnicodeString(); |
| } |
| if (result.suffix.isBogus()) { |
| result.suffix = UnicodeString(); |
| } |
| result.flags |= fFlags; |
| if (fPrefix != nullptr) { |
| fPrefix->postProcess(result); |
| } |
| if (fSuffix != nullptr) { |
| fSuffix->postProcess(result); |
| } |
| } |
| } |
| |
| int8_t AffixMatcher::compareTo(const AffixMatcher& rhs) const { |
| const AffixMatcher& lhs = *this; |
| if (length(lhs.fPrefix) != length(rhs.fPrefix)) { |
| return length(lhs.fPrefix) > length(rhs.fPrefix) ? -1 : 1; |
| } else if (length(lhs.fSuffix) != length(rhs.fSuffix)) { |
| return length(lhs.fSuffix) > length(rhs.fSuffix) ? -1 : 1; |
| } else { |
| return 0; |
| } |
| } |
| |
| UnicodeString AffixMatcher::toString() const { |
| bool isNegative = 0 != (fFlags & FLAG_NEGATIVE); |
| return UnicodeString(u"<Affix") + (isNegative ? u":negative " : u" ") + |
| (fPrefix ? fPrefix->getPattern() : u"null") + u"#" + |
| (fSuffix ? fSuffix->getPattern() : u"null") + u">"; |
| |
| } |
| |
| |
| #endif /* #if !UCONFIG_NO_FORMATTING */ |