| // © 2017 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_FORMATTING |
| |
| #include "number_affixutils.h" |
| #include "unicode/utf16.h" |
| #include "unicode/uniset.h" |
| |
| using namespace icu; |
| using namespace icu::number; |
| using namespace icu::number::impl; |
| |
| // Out-of-line definitions of the defaulted destructors for TokenConsumer and SymbolProvider. |
| // NOTE(review): presumably both are abstract callback interfaces declared in number_affixutils.h; |
| // defining the destructors here would anchor them in this translation unit -- confirm against the header. |
| TokenConsumer::~TokenConsumer() = default; |
| SymbolProvider::~SymbolProvider() = default; |
| |
| int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) { |
| AffixPatternState state = STATE_BASE; |
| int32_t offset = 0; |
| int32_t length = 0; |
| for (; offset < patternString.length();) { |
| UChar32 cp = patternString.char32At(offset); |
| |
| switch (state) { |
| case STATE_BASE: |
| if (cp == u'\'') { |
| // First quote |
| state = STATE_FIRST_QUOTE; |
| } else { |
| // Unquoted symbol |
| length++; |
| } |
| break; |
| case STATE_FIRST_QUOTE: |
| if (cp == u'\'') { |
| // Repeated quote |
| length++; |
| state = STATE_BASE; |
| } else { |
| // Quoted code point |
| length++; |
| state = STATE_INSIDE_QUOTE; |
| } |
| break; |
| case STATE_INSIDE_QUOTE: |
| if (cp == u'\'') { |
| // End of quoted sequence |
| state = STATE_AFTER_QUOTE; |
| } else { |
| // Quoted code point |
| length++; |
| } |
| break; |
| case STATE_AFTER_QUOTE: |
| if (cp == u'\'') { |
| // Double quote inside of quoted sequence |
| length++; |
| state = STATE_INSIDE_QUOTE; |
| } else { |
| // Unquoted symbol |
| length++; |
| } |
| break; |
| default: |
| UPRV_UNREACHABLE; |
| } |
| |
| offset += U16_LENGTH(cp); |
| } |
| |
| switch (state) { |
| case STATE_FIRST_QUOTE: |
| case STATE_INSIDE_QUOTE: |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| break; |
| default: |
| break; |
| } |
| |
| return length; |
| } |
| |
| UnicodeString AffixUtils::escape(const UnicodeString &input) { |
| AffixPatternState state = STATE_BASE; |
| int32_t offset = 0; |
| UnicodeString output; |
| for (; offset < input.length();) { |
| UChar32 cp = input.char32At(offset); |
| |
| switch (cp) { |
| case u'\'': |
| output.append(u"''", -1); |
| break; |
| |
| case u'-': |
| case u'+': |
| case u'%': |
| case u'‰': |
| case u'¤': |
| if (state == STATE_BASE) { |
| output.append(u'\''); |
| output.append(cp); |
| state = STATE_INSIDE_QUOTE; |
| } else { |
| output.append(cp); |
| } |
| break; |
| |
| default: |
| if (state == STATE_INSIDE_QUOTE) { |
| output.append(u'\''); |
| output.append(cp); |
| state = STATE_BASE; |
| } else { |
| output.append(cp); |
| } |
| break; |
| } |
| offset += U16_LENGTH(cp); |
| } |
| |
| if (state == STATE_INSIDE_QUOTE) { |
| output.append(u'\''); |
| } |
| |
| return output; |
| } |
| |
| Field AffixUtils::getFieldForType(AffixPatternType type) { |
| switch (type) { |
| case TYPE_MINUS_SIGN: |
| return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD}; |
| case TYPE_PLUS_SIGN: |
| return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD}; |
| case TYPE_PERCENT: |
| return {UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD}; |
| case TYPE_PERMILLE: |
| return {UFIELD_CATEGORY_NUMBER, UNUM_PERMILL_FIELD}; |
| case TYPE_CURRENCY_SINGLE: |
| return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; |
| case TYPE_CURRENCY_DOUBLE: |
| return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; |
| case TYPE_CURRENCY_TRIPLE: |
| return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; |
| case TYPE_CURRENCY_QUAD: |
| return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; |
| case TYPE_CURRENCY_QUINT: |
| return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; |
| case TYPE_CURRENCY_OVERFLOW: |
| return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; |
| default: |
| UPRV_UNREACHABLE; |
| } |
| } |
| |
| int32_t |
| AffixUtils::unescape(const UnicodeString &affixPattern, FormattedStringBuilder &output, int32_t position, |
| const SymbolProvider &provider, Field field, UErrorCode &status) { |
| int32_t length = 0; |
| AffixTag tag; |
| while (hasNext(tag, affixPattern)) { |
| tag = nextToken(tag, affixPattern, status); |
| if (U_FAILURE(status)) { return length; } |
| if (tag.type == TYPE_CURRENCY_OVERFLOW) { |
| // Don't go to the provider for this special case |
| length += output.insertCodePoint( |
| position + length, |
| 0xFFFD, |
| {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}, |
| status); |
| } else if (tag.type < 0) { |
| length += output.insert( |
| position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status); |
| } else { |
| length += output.insertCodePoint(position + length, tag.codePoint, field, status); |
| } |
| } |
| return length; |
| } |
| |
| int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern, |
| const SymbolProvider &provider, UErrorCode &status) { |
| int32_t length = 0; |
| AffixTag tag; |
| while (hasNext(tag, affixPattern)) { |
| tag = nextToken(tag, affixPattern, status); |
| if (U_FAILURE(status)) { return length; } |
| if (tag.type == TYPE_CURRENCY_OVERFLOW) { |
| length += 1; |
| } else if (tag.type < 0) { |
| length += provider.getSymbol(tag.type).length(); |
| } else { |
| length += U16_LENGTH(tag.codePoint); |
| } |
| } |
| return length; |
| } |
| |
| bool |
| AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) { |
| if (affixPattern.length() == 0) { |
| return false; |
| } |
| AffixTag tag; |
| while (hasNext(tag, affixPattern)) { |
| tag = nextToken(tag, affixPattern, status); |
| if (U_FAILURE(status)) { return false; } |
| if (tag.type == type) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) { |
| if (affixPattern.length() == 0) { |
| return false; |
| } |
| AffixTag tag; |
| while (hasNext(tag, affixPattern)) { |
| tag = nextToken(tag, affixPattern, status); |
| if (U_FAILURE(status)) { return false; } |
| if (tag.type < 0 && getFieldForType(tag.type) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type, |
| char16_t replacementChar, UErrorCode &status) { |
| UnicodeString output(affixPattern); // copy |
| if (affixPattern.length() == 0) { |
| return output; |
| } |
| AffixTag tag; |
| while (hasNext(tag, affixPattern)) { |
| tag = nextToken(tag, affixPattern, status); |
| if (U_FAILURE(status)) { return output; } |
| if (tag.type == type) { |
| output.replace(tag.offset - 1, 1, replacementChar); |
| } |
| } |
| return output; |
| } |
| |
| bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern, |
| const UnicodeSet& ignorables, UErrorCode& status) { |
| if (affixPattern.length() == 0) { |
| return true; |
| } |
| AffixTag tag; |
| while (hasNext(tag, affixPattern)) { |
| tag = nextToken(tag, affixPattern, status); |
| if (U_FAILURE(status)) { return false; } |
| if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer, |
| UErrorCode& status) { |
| if (affixPattern.length() == 0) { |
| return; |
| } |
| AffixTag tag; |
| while (hasNext(tag, affixPattern)) { |
| tag = nextToken(tag, affixPattern, status); |
| if (U_FAILURE(status)) { return; } |
| consumer.consumeToken(tag.type, tag.codePoint, status); |
| if (U_FAILURE(status)) { return; } |
| } |
| } |
| |
| // Returns the next token of patternString, resuming from the position and state stored in tag. |
| // |
| // The scan starts at tag.offset in state tag.state and runs a state machine over the code points |
| // until one token is complete. Tokens are built with makeTag(offset, type, state, codePoint): |
| // the offset is where the following call should resume and the state is the one it resumes in |
| // (only STATE_BASE or STATE_INSIDE_QUOTE are ever stored). Symbol tokens carry code point 0. |
| // |
| // If the end of the string is reached without a token, {-1} is returned; hasNext() treats a |
| // negative offset as "no more tokens". Ending right after an opening quote or inside a quoted |
| // run additionally sets status to U_ILLEGAL_ARGUMENT_ERROR. |
| AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) { |
| int32_t offset = tag.offset; |
| int32_t state = tag.state; |
| for (; offset < patternString.length();) { |
| UChar32 cp = patternString.char32At(offset); |
| // Number of UTF-16 code units occupied by cp; used to step past it. |
| int32_t count = U16_LENGTH(cp); |
| |
| switch (state) { |
| case STATE_BASE: |
| // Outside of any quote: symbol characters are tokens of their own. |
| switch (cp) { |
| case u'\'': |
| state = STATE_FIRST_QUOTE; |
| offset += count; |
| // continue to the next code point |
| break; |
| case u'-': |
| return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0); |
| case u'+': |
| return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0); |
| case u'%': |
| return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0); |
| case u'‰': |
| return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0); |
| case u'¤': |
| // Start of a currency run; its token type depends on how many ¤ follow. |
| state = STATE_FIRST_CURR; |
| offset += count; |
| // continue to the next code point |
| break; |
| default: |
| return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp); |
| } |
| break; |
| case STATE_FIRST_QUOTE: |
| // Directly after an opening quote: '' is a literal apostrophe outside of a quoted run; |
| // anything else is the first literal of a quoted run. |
| if (cp == u'\'') { |
| return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp); |
| } else { |
| return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); |
| } |
| case STATE_INSIDE_QUOTE: |
| // Inside a quoted run every code point is a literal, except a quote, which may end the run. |
| if (cp == u'\'') { |
| state = STATE_AFTER_QUOTE; |
| offset += count; |
| // continue to the next code point |
| break; |
| } else { |
| return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); |
| } |
| case STATE_AFTER_QUOTE: |
| // A quote was seen inside a quoted run: a second quote makes it a literal apostrophe and |
| // the run continues; otherwise the run is over. |
| if (cp == u'\'') { |
| return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); |
| } else { |
| state = STATE_BASE; |
| // re-evaluate this code point |
| // (offset is intentionally not advanced, so the next iteration sees cp in STATE_BASE) |
| break; |
| } |
| // Currency run states: each additional ¤ moves to the next state. When a different code |
| // point ends the run, the tag uses offset (not offset + count) because that code point |
| // has not been consumed and must be scanned by the following call. |
| case STATE_FIRST_CURR: |
| if (cp == u'¤') { |
| state = STATE_SECOND_CURR; |
| offset += count; |
| // continue to the next code point |
| break; |
| } else { |
| return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0); |
| } |
| case STATE_SECOND_CURR: |
| if (cp == u'¤') { |
| state = STATE_THIRD_CURR; |
| offset += count; |
| // continue to the next code point |
| break; |
| } else { |
| return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0); |
| } |
| case STATE_THIRD_CURR: |
| if (cp == u'¤') { |
| state = STATE_FOURTH_CURR; |
| offset += count; |
| // continue to the next code point |
| break; |
| } else { |
| return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0); |
| } |
| case STATE_FOURTH_CURR: |
| if (cp == u'¤') { |
| state = STATE_FIFTH_CURR; |
| offset += count; |
| // continue to the next code point |
| break; |
| } else { |
| return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0); |
| } |
| case STATE_FIFTH_CURR: |
| if (cp == u'¤') { |
| state = STATE_OVERFLOW_CURR; |
| offset += count; |
| // continue to the next code point |
| break; |
| } else { |
| return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0); |
| } |
| case STATE_OVERFLOW_CURR: |
| // Six or more ¤ in a row: keep swallowing them; the whole run becomes one overflow token. |
| if (cp == u'¤') { |
| offset += count; |
| // continue to the next code point and loop back to this state |
| break; |
| } else { |
| return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0); |
| } |
| default: |
| UPRV_UNREACHABLE; |
| } |
| } |
| // End of string |
| // The loop ran out of input; what happens depends on the state the scan stopped in. |
| switch (state) { |
| case STATE_BASE: |
| // No more tokens in string. |
| return {-1}; |
| case STATE_FIRST_QUOTE: |
| case STATE_INSIDE_QUOTE: |
| // For consistent behavior with the JDK and ICU 58, set an error here. |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return {-1}; |
| case STATE_AFTER_QUOTE: |
| // No more tokens in string. |
| return {-1}; |
| // A currency run that extends to the end of the string still yields its token. |
| case STATE_FIRST_CURR: |
| return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0); |
| case STATE_SECOND_CURR: |
| return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0); |
| case STATE_THIRD_CURR: |
| return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0); |
| case STATE_FOURTH_CURR: |
| return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0); |
| case STATE_FIFTH_CURR: |
| return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0); |
| case STATE_OVERFLOW_CURR: |
| return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0); |
| default: |
| UPRV_UNREACHABLE; |
| } |
| } |
| |
| bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) { |
| // First check for the {-1} and default initializer syntax. |
| if (tag.offset < 0) { |
| return false; |
| } else if (tag.offset == 0) { |
| return string.length() > 0; |
| } |
| // The rest of the fields are safe to use now. |
| // Special case: the last character in string is an end quote. |
| if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 && |
| string.charAt(tag.offset) == u'\'') { |
| return false; |
| } else if (tag.state != STATE_BASE) { |
| return true; |
| } else { |
| return tag.offset < string.length(); |
| } |
| } |
| |
| #endif /* #if !UCONFIG_NO_FORMATTING */ |