| /* |
| ************************************************************************** |
| * Copyright (C) 2002-2015 International Business Machines Corporation * |
| * and others. All rights reserved. * |
| ************************************************************************** |
| */ |
| // |
| // file: rematch.cpp |
| // |
| // Contains the implementation of class RegexMatcher, |
| // which is one of the main API classes for the ICU regular expression package. |
| // |
| |
| #include "unicode/utypes.h" |
| #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| |
| #include "starboard/client_porting/poem/assert_poem.h" |
| #include "starboard/client_porting/poem/string_poem.h" |
| #include "unicode/regex.h" |
| #include "unicode/uniset.h" |
| #include "unicode/uchar.h" |
| #include "unicode/ustring.h" |
| #include "unicode/rbbi.h" |
| #include "unicode/utf.h" |
| #include "unicode/utf16.h" |
| #include "uassert.h" |
| #include "cmemory.h" |
| #include "uvector.h" |
| #include "uvectr32.h" |
| #include "uvectr64.h" |
| #include "regeximp.h" |
| #include "regexst.h" |
| #include "regextxt.h" |
| #include "ucase.h" |
| |
| // #include <malloc.h> // Needed for heapcheck testing |
| |
| U_NAMESPACE_BEGIN |
| |
| // Default limit for the size of the back track stack, to avoid system |
| // failures causedby heap exhaustion. Units are in 32 bit words, not bytes. |
| // This value puts ICU's limits higher than most other regexp implementations, |
| // which use recursion rather than the heap, and take more storage per |
| // backtrack point. |
| // |
| static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; |
| |
| // Time limit counter constant. |
| // Time limits for expression evaluation are in terms of quanta of work by |
| // the engine, each of which is 10,000 state saves. |
| // This constant determines that state saves per tick number. |
| static const int32_t TIMER_INITIAL_VALUE = 10000; |
| |
| |
| // Test for any of the Unicode line terminating characters. |
| static inline UBool isLineTerminator(UChar32 c) { |
| if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) { |
| return false; |
| } |
| return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029; |
| } |
| |
| //----------------------------------------------------------------------------- |
| // |
| // Constructor and Destructor |
| // |
| //----------------------------------------------------------------------------- |
| RegexMatcher::RegexMatcher(const RegexPattern *pat) { |
| fDeferredStatus = U_ZERO_ERROR; |
| init(fDeferredStatus); |
| if (U_FAILURE(fDeferredStatus)) { |
| return; |
| } |
| if (pat==NULL) { |
| fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| fPattern = pat; |
| init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); |
| } |
| |
| |
| |
| RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, |
| uint32_t flags, UErrorCode &status) { |
| init(status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| UParseError pe; |
| fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
| fPattern = fPatternOwned; |
| |
| UText inputText = UTEXT_INITIALIZER; |
| utext_openConstUnicodeString(&inputText, &input, &status); |
| init2(&inputText, status); |
| utext_close(&inputText); |
| |
| fInputUniStrMaybeMutable = TRUE; |
| } |
| |
| |
| RegexMatcher::RegexMatcher(UText *regexp, UText *input, |
| uint32_t flags, UErrorCode &status) { |
| init(status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| UParseError pe; |
| fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| fPattern = fPatternOwned; |
| init2(input, status); |
| } |
| |
| |
| RegexMatcher::RegexMatcher(const UnicodeString ®exp, |
| uint32_t flags, UErrorCode &status) { |
| init(status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| UParseError pe; |
| fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| fPattern = fPatternOwned; |
| init2(RegexStaticSets::gStaticSets->fEmptyText, status); |
| } |
| |
| RegexMatcher::RegexMatcher(UText *regexp, |
| uint32_t flags, UErrorCode &status) { |
| init(status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| UParseError pe; |
| fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| fPattern = fPatternOwned; |
| init2(RegexStaticSets::gStaticSets->fEmptyText, status); |
| } |
| |
| |
| |
| |
| RegexMatcher::~RegexMatcher() { |
| delete fStack; |
| if (fData != fSmallData) { |
| uprv_free(fData); |
| fData = NULL; |
| } |
| if (fPatternOwned) { |
| delete fPatternOwned; |
| fPatternOwned = NULL; |
| fPattern = NULL; |
| } |
| |
| if (fInput) { |
| delete fInput; |
| } |
| if (fInputText) { |
| utext_close(fInputText); |
| } |
| if (fAltInputText) { |
| utext_close(fAltInputText); |
| } |
| |
| #if UCONFIG_NO_BREAK_ITERATION==0 |
| delete fWordBreakItr; |
| #endif |
| } |
| |
| // |
| // init() common initialization for use by all constructors. |
| // Initialize all fields, get the object into a consistent state. |
| // This must be done even when the initial status shows an error, |
| // so that the object is initialized sufficiently well for the destructor |
| // to run safely. |
| // |
| void RegexMatcher::init(UErrorCode &status) { |
| fPattern = NULL; |
| fPatternOwned = NULL; |
| fFrameSize = 0; |
| fRegionStart = 0; |
| fRegionLimit = 0; |
| fAnchorStart = 0; |
| fAnchorLimit = 0; |
| fLookStart = 0; |
| fLookLimit = 0; |
| fActiveStart = 0; |
| fActiveLimit = 0; |
| fTransparentBounds = FALSE; |
| fAnchoringBounds = TRUE; |
| fMatch = FALSE; |
| fMatchStart = 0; |
| fMatchEnd = 0; |
| fLastMatchEnd = -1; |
| fAppendPosition = 0; |
| fHitEnd = FALSE; |
| fRequireEnd = FALSE; |
| fStack = NULL; |
| fFrame = NULL; |
| fTimeLimit = 0; |
| fTime = 0; |
| fTickCounter = 0; |
| fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; |
| fCallbackFn = NULL; |
| fCallbackContext = NULL; |
| fFindProgressCallbackFn = NULL; |
| fFindProgressCallbackContext = NULL; |
| fTraceDebug = FALSE; |
| fDeferredStatus = status; |
| fData = fSmallData; |
| fWordBreakItr = NULL; |
| |
| fStack = NULL; |
| fInputText = NULL; |
| fAltInputText = NULL; |
| fInput = NULL; |
| fInputLength = 0; |
| fInputUniStrMaybeMutable = FALSE; |
| } |
| |
| // |
| // init2() Common initialization for use by RegexMatcher constructors, part 2. |
| // This handles the common setup to be done after the Pattern is available. |
| // |
| void RegexMatcher::init2(UText *input, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| fDeferredStatus = status; |
| return; |
| } |
| |
| if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) { |
| fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); |
| if (fData == NULL) { |
| status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| } |
| |
| fStack = new UVector64(status); |
| if (fStack == NULL) { |
| status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| reset(input); |
| setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status); |
| if (U_FAILURE(status)) { |
| fDeferredStatus = status; |
| return; |
| } |
| } |
| |
| |
| static const UChar BACKSLASH = 0x5c; |
| static const UChar DOLLARSIGN = 0x24; |
| static const UChar LEFTBRACKET = 0x7b; |
| static const UChar RIGHTBRACKET = 0x7d; |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // appendReplacement |
| // |
| //-------------------------------------------------------------------------------- |
| RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, |
| const UnicodeString &replacement, |
| UErrorCode &status) { |
| UText replacementText = UTEXT_INITIALIZER; |
| |
| utext_openConstUnicodeString(&replacementText, &replacement, &status); |
| if (U_SUCCESS(status)) { |
| UText resultText = UTEXT_INITIALIZER; |
| utext_openUnicodeString(&resultText, &dest, &status); |
| |
| if (U_SUCCESS(status)) { |
| appendReplacement(&resultText, &replacementText, status); |
| utext_close(&resultText); |
| } |
| utext_close(&replacementText); |
| } |
| |
| return *this; |
| } |
| |
| // |
| // appendReplacement, UText mode |
| // |
| RegexMatcher &RegexMatcher::appendReplacement(UText *dest, |
| UText *replacement, |
| UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return *this; |
| } |
| if (fMatch == FALSE) { |
| status = U_REGEX_INVALID_STATE; |
| return *this; |
| } |
| |
| // Copy input string from the end of previous match to start of current match |
| int64_t destLen = utext_nativeLength(dest); |
| if (fMatchStart > fAppendPosition) { |
| if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, |
| (int32_t)(fMatchStart-fAppendPosition), &status); |
| } else { |
| int32_t len16; |
| if (UTEXT_USES_U16(fInputText)) { |
| len16 = (int32_t)(fMatchStart-fAppendPosition); |
| } else { |
| UErrorCode lengthStatus = U_ZERO_ERROR; |
| len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus); |
| } |
| UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); |
| if (inputChars == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return *this; |
| } |
| utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status); |
| destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status); |
| uprv_free(inputChars); |
| } |
| } |
| fAppendPosition = fMatchEnd; |
| |
| |
| // scan the replacement text, looking for substitutions ($n) and \escapes. |
| // TODO: optimize this loop by efficiently scanning for '$' or '\', |
| // move entire ranges not containing substitutions. |
| UTEXT_SETNATIVEINDEX(replacement, 0); |
| for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) { |
| if (c == BACKSLASH) { |
| // Backslash Escape. Copy the following char out without further checks. |
| // Note: Surrogate pairs don't need any special handling |
| // The second half wont be a '$' or a '\', and |
| // will move to the dest normally on the next |
| // loop iteration. |
| c = UTEXT_CURRENT32(replacement); |
| if (c == U_SENTINEL) { |
| break; |
| } |
| |
| if (c==0x55/*U*/ || c==0x75/*u*/) { |
| // We have a \udddd or \Udddddddd escape sequence. |
| int32_t offset = 0; |
| struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement); |
| UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); |
| if (escapedChar != (UChar32)0xFFFFFFFF) { |
| if (U_IS_BMP(escapedChar)) { |
| UChar c16 = (UChar)escapedChar; |
| destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); |
| } else { |
| UChar surrogate[2]; |
| surrogate[0] = U16_LEAD(escapedChar); |
| surrogate[1] = U16_TRAIL(escapedChar); |
| if (U_SUCCESS(status)) { |
| destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); |
| } |
| } |
| // TODO: Report errors for mal-formed \u escapes? |
| // As this is, the original sequence is output, which may be OK. |
| if (context.lastOffset == offset) { |
| (void)UTEXT_PREVIOUS32(replacement); |
| } else if (context.lastOffset != offset-1) { |
| utext_moveIndex32(replacement, offset - context.lastOffset - 1); |
| } |
| } |
| } else { |
| (void)UTEXT_NEXT32(replacement); |
| // Plain backslash escape. Just put out the escaped character. |
| if (U_IS_BMP(c)) { |
| UChar c16 = (UChar)c; |
| destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); |
| } else { |
| UChar surrogate[2]; |
| surrogate[0] = U16_LEAD(c); |
| surrogate[1] = U16_TRAIL(c); |
| if (U_SUCCESS(status)) { |
| destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); |
| } |
| } |
| } |
| } else if (c != DOLLARSIGN) { |
| // Normal char, not a $. Copy it out without further checks. |
| if (U_IS_BMP(c)) { |
| UChar c16 = (UChar)c; |
| destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); |
| } else { |
| UChar surrogate[2]; |
| surrogate[0] = U16_LEAD(c); |
| surrogate[1] = U16_TRAIL(c); |
| if (U_SUCCESS(status)) { |
| destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); |
| } |
| } |
| } else { |
| // We've got a $. Pick up a capture group name or number if one follows. |
| // Consume digits so long as the resulting group number <= the number of |
| // number of capture groups in the pattern. |
| |
| int32_t groupNum = 0; |
| int32_t numDigits = 0; |
| UChar32 nextChar = utext_current32(replacement); |
| if (nextChar == LEFTBRACKET) { |
| // Scan for a Named Capture Group, ${name}. |
| UnicodeString groupName; |
| utext_next32(replacement); |
| while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) { |
| nextChar = utext_next32(replacement); |
| if (nextChar == U_SENTINEL) { |
| status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; |
| } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z |
| (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z |
| (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9 |
| groupName.append(nextChar); |
| } else if (nextChar == RIGHTBRACKET) { |
| groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName); |
| if (groupNum == 0) { |
| status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; |
| } |
| } else { |
| // Character was something other than a name char or a closing '}' |
| status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; |
| } |
| } |
| |
| } else if (u_isdigit(nextChar)) { |
| // $n Scan for a capture group number |
| int32_t numCaptureGroups = fPattern->fGroupMap->size(); |
| for (;;) { |
| nextChar = UTEXT_CURRENT32(replacement); |
| if (nextChar == U_SENTINEL) { |
| break; |
| } |
| if (u_isdigit(nextChar) == FALSE) { |
| break; |
| } |
| int32_t nextDigitVal = u_charDigitValue(nextChar); |
| if (groupNum*10 + nextDigitVal > numCaptureGroups) { |
| // Don't consume the next digit if it makes the capture group number too big. |
| if (numDigits == 0) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| } |
| break; |
| } |
| (void)UTEXT_NEXT32(replacement); |
| groupNum=groupNum*10 + nextDigitVal; |
| ++numDigits; |
| } |
| } else { |
| // $ not followed by capture group name or number. |
| status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; |
| } |
| |
| if (U_SUCCESS(status)) { |
| destLen += appendGroup(groupNum, dest, status); |
| } |
| } // End of $ capture group handling |
| } // End of per-character loop through the replacement string. |
| |
| return *this; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // appendTail Intended to be used in conjunction with appendReplacement() |
| // To the destination string, append everything following |
| // the last match position from the input string. |
| // |
| // Note: Match ranges do not affect appendTail or appendReplacement |
| // |
| //-------------------------------------------------------------------------------- |
| UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { |
| UErrorCode status = U_ZERO_ERROR; |
| UText resultText = UTEXT_INITIALIZER; |
| utext_openUnicodeString(&resultText, &dest, &status); |
| |
| if (U_SUCCESS(status)) { |
| appendTail(&resultText, status); |
| utext_close(&resultText); |
| } |
| |
| return dest; |
| } |
| |
| // |
| // appendTail, UText mode |
| // |
| UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return dest; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return dest; |
| } |
| |
| if (fInputLength > fAppendPosition) { |
| if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| int64_t destLen = utext_nativeLength(dest); |
| utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, |
| (int32_t)(fInputLength-fAppendPosition), &status); |
| } else { |
| int32_t len16; |
| if (UTEXT_USES_U16(fInputText)) { |
| len16 = (int32_t)(fInputLength-fAppendPosition); |
| } else { |
| len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status); |
| status = U_ZERO_ERROR; // buffer overflow |
| } |
| |
| UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); |
| if (inputChars == NULL) { |
| fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
| } else { |
| utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated |
| int64_t destLen = utext_nativeLength(dest); |
| utext_replace(dest, destLen, destLen, inputChars, len16, &status); |
| uprv_free(inputChars); |
| } |
| } |
| } |
| return dest; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // end |
| // |
| //-------------------------------------------------------------------------------- |
| int32_t RegexMatcher::end(UErrorCode &err) const { |
| return end(0, err); |
| } |
| |
| int64_t RegexMatcher::end64(UErrorCode &err) const { |
| return end64(0, err); |
| } |
| |
| int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const { |
| if (U_FAILURE(err)) { |
| return -1; |
| } |
| if (fMatch == FALSE) { |
| err = U_REGEX_INVALID_STATE; |
| return -1; |
| } |
| if (group < 0 || group > fPattern->fGroupMap->size()) { |
| err = U_INDEX_OUTOFBOUNDS_ERROR; |
| return -1; |
| } |
| int64_t e = -1; |
| if (group == 0) { |
| e = fMatchEnd; |
| } else { |
| // Get the position within the stack frame of the variables for |
| // this capture group. |
| int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); |
| U_ASSERT(groupOffset < fPattern->fFrameSize); |
| U_ASSERT(groupOffset >= 0); |
| e = fFrame->fExtra[groupOffset + 1]; |
| } |
| |
| return e; |
| } |
| |
| int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { |
| return (int32_t)end64(group, err); |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // findProgressInterrupt This function is called once for each advance in the target |
| // string from the find() function, and calls the user progress callback |
| // function if there is one installed. |
| // |
| // Return: TRUE if the find operation is to be terminated. |
| // FALSE if the find operation is to continue running. |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) { |
| if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) { |
| status = U_REGEX_STOPPED_BY_CALLER; |
| return TRUE; |
| } |
| return FALSE; |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // find() |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::find() { |
| if (U_FAILURE(fDeferredStatus)) { |
| return FALSE; |
| } |
| UErrorCode status = U_ZERO_ERROR; |
| UBool result = find(status); |
| return result; |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // find() |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::find(UErrorCode &status) { |
| // Start at the position of the last match end. (Will be zero if the |
| // matcher has been reset.) |
| // |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return FALSE; |
| } |
| |
| if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| return findUsingChunk(status); |
| } |
| |
| int64_t startPos = fMatchEnd; |
| if (startPos==0) { |
| startPos = fActiveStart; |
| } |
| |
| if (fMatch) { |
| // Save the position of any previous successful match. |
| fLastMatchEnd = fMatchEnd; |
| |
| if (fMatchStart == fMatchEnd) { |
| // Previous match had zero length. Move start position up one position |
| // to avoid sending find() into a loop on zero-length matches. |
| if (startPos >= fActiveLimit) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| (void)UTEXT_NEXT32(fInputText); |
| startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| } |
| } else { |
| if (fLastMatchEnd >= 0) { |
| // A previous find() failed to match. Don't try again. |
| // (without this test, a pattern with a zero-length match |
| // could match again at the end of an input string.) |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| } |
| |
| |
| // Compute the position in the input string beyond which a match can not begin, because |
| // the minimum length match would extend past the end of the input. |
| // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. |
| // Be aware of possible overflows if making changes here. |
| int64_t testStartLimit; |
| if (UTEXT_USES_U16(fInputText)) { |
| testStartLimit = fActiveLimit - fPattern->fMinMatchLen; |
| if (startPos > testStartLimit) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| } else { |
| // We don't know exactly how long the minimum match length is in native characters. |
| // Treat anything > 0 as 1. |
| testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0); |
| } |
| |
| UChar32 c; |
| U_ASSERT(startPos >= 0); |
| |
| switch (fPattern->fStartType) { |
| case START_NO_INFO: |
| // No optimization was found. |
| // Try a match at each input position. |
| for (;;) { |
| MatchAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| if (startPos >= testStartLimit) { |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| (void)UTEXT_NEXT32(fInputText); |
| startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| // Note that it's perfectly OK for a pattern to have a zero-length |
| // match at the end of a string, so we must make sure that the loop |
| // runs with startPos == testStartLimit the last time through. |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| U_ASSERT(FALSE); |
| |
| case START_START: |
| // Matches are only possible at the start of the input string |
| // (pattern begins with ^ or \A) |
| if (startPos > fActiveStart) { |
| fMatch = FALSE; |
| return FALSE; |
| } |
| MatchAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| return fMatch; |
| |
| |
| case START_SET: |
| { |
| // Match may start on any char from a pre-computed set. |
| U_ASSERT(fPattern->fMinMatchLen > 0); |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| for (;;) { |
| int64_t pos = startPos; |
| c = UTEXT_NEXT32(fInputText); |
| startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| // c will be -1 (U_SENTINEL) at end of text, in which case we |
| // skip this next block (so we don't have a negative array index) |
| // and handle end of text in the following block. |
| if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) || |
| (c>=256 && fPattern->fInitialChars->contains(c)))) { |
| MatchAt(pos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| UTEXT_SETNATIVEINDEX(fInputText, pos); |
| } |
| if (startPos > testStartLimit) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| } |
| U_ASSERT(FALSE); |
| |
| case START_STRING: |
| case START_CHAR: |
| { |
| // Match starts on exactly one char. |
| U_ASSERT(fPattern->fMinMatchLen > 0); |
| UChar32 theChar = fPattern->fInitialChar; |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| for (;;) { |
| int64_t pos = startPos; |
| c = UTEXT_NEXT32(fInputText); |
| startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| if (c == theChar) { |
| MatchAt(pos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| UTEXT_SETNATIVEINDEX(fInputText, pos); |
| } |
| if (startPos > testStartLimit) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| } |
| U_ASSERT(FALSE); |
| |
| case START_LINE: |
| { |
| UChar32 c; |
| if (startPos == fAnchorStart) { |
| MatchAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| c = UTEXT_NEXT32(fInputText); |
| startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| } else { |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| c = UTEXT_PREVIOUS32(fInputText); |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| } |
| |
| if (fPattern->fFlags & UREGEX_UNIX_LINES) { |
| for (;;) { |
| if (c == 0x0a) { |
| MatchAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| } |
| if (startPos >= testStartLimit) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| c = UTEXT_NEXT32(fInputText); |
| startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| // Note that it's perfectly OK for a pattern to have a zero-length |
| // match at the end of a string, so we must make sure that the loop |
| // runs with startPos == testStartLimit the last time through. |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| } else { |
| for (;;) { |
| if (isLineTerminator(c)) { |
| if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { |
| (void)UTEXT_NEXT32(fInputText); |
| startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| } |
| MatchAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| UTEXT_SETNATIVEINDEX(fInputText, startPos); |
| } |
| if (startPos >= testStartLimit) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| c = UTEXT_NEXT32(fInputText); |
| startPos = UTEXT_GETNATIVEINDEX(fInputText); |
| // Note that it's perfectly OK for a pattern to have a zero-length |
| // match at the end of a string, so we must make sure that the loop |
| // runs with startPos == testStartLimit the last time through. |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| } |
| } |
| |
| default: |
| U_ASSERT(FALSE); |
| } |
| |
| U_ASSERT(FALSE); |
| return FALSE; |
| } |
| |
| |
| |
| UBool RegexMatcher::find(int64_t start, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return FALSE; |
| } |
| this->reset(); // Note: Reset() is specified by Java Matcher documentation. |
| // This will reset the region to be the full input length. |
| if (start < 0) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return FALSE; |
| } |
| |
| int64_t nativeStart = start; |
| if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return FALSE; |
| } |
| fMatchEnd = nativeStart; |
| return find(status); |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // findUsingChunk() -- like find(), but with the advance knowledge that the |
| // entire string is available in the UText's chunk buffer. |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::findUsingChunk(UErrorCode &status) { |
| // Start at the position of the last match end. (Will be zero if the |
| // matcher has been reset. |
| // |
| |
| int32_t startPos = (int32_t)fMatchEnd; |
| if (startPos==0) { |
| startPos = (int32_t)fActiveStart; |
| } |
| |
| const UChar *inputBuf = fInputText->chunkContents; |
| |
| if (fMatch) { |
| // Save the position of any previous successful match. |
| fLastMatchEnd = fMatchEnd; |
| |
| if (fMatchStart == fMatchEnd) { |
| // Previous match had zero length. Move start position up one position |
| // to avoid sending find() into a loop on zero-length matches. |
| if (startPos >= fActiveLimit) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| U16_FWD_1(inputBuf, startPos, fInputLength); |
| } |
| } else { |
| if (fLastMatchEnd >= 0) { |
| // A previous find() failed to match. Don't try again. |
| // (without this test, a pattern with a zero-length match |
| // could match again at the end of an input string.) |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| } |
| |
| |
| // Compute the position in the input string beyond which a match can not begin, because |
| // the minimum length match would extend past the end of the input. |
| // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. |
| // Be aware of possible overflows if making changes here. |
| // Note: a match can begin at inputBuf + testLen; it is an inclusive limit. |
| int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen); |
| if (startPos > testLen) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| |
| UChar32 c; |
| U_ASSERT(startPos >= 0); |
| |
| switch (fPattern->fStartType) { |
| case START_NO_INFO: |
| // No optimization was found. |
| // Try a match at each input position. |
| for (;;) { |
| MatchChunkAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| if (startPos >= testLen) { |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| U16_FWD_1(inputBuf, startPos, fActiveLimit); |
| // Note that it's perfectly OK for a pattern to have a zero-length |
| // match at the end of a string, so we must make sure that the loop |
| // runs with startPos == testLen the last time through. |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| U_ASSERT(FALSE); |
| |
| case START_START: |
| // Matches are only possible at the start of the input string |
| // (pattern begins with ^ or \A) |
| if (startPos > fActiveStart) { |
| fMatch = FALSE; |
| return FALSE; |
| } |
| MatchChunkAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| return fMatch; |
| |
| |
| case START_SET: |
| { |
| // Match may start on any char from a pre-computed set. |
| U_ASSERT(fPattern->fMinMatchLen > 0); |
| for (;;) { |
| int32_t pos = startPos; |
| U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; |
| if ((c<256 && fPattern->fInitialChars8->contains(c)) || |
| (c>=256 && fPattern->fInitialChars->contains(c))) { |
| MatchChunkAt(pos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| } |
| if (startPos > testLen) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| } |
| U_ASSERT(FALSE); |
| |
| case START_STRING: |
| case START_CHAR: |
| { |
| // Match starts on exactly one char. |
| U_ASSERT(fPattern->fMinMatchLen > 0); |
| UChar32 theChar = fPattern->fInitialChar; |
| for (;;) { |
| int32_t pos = startPos; |
| U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; |
| if (c == theChar) { |
| MatchChunkAt(pos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| } |
| if (startPos > testLen) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| } |
| U_ASSERT(FALSE); |
| |
| case START_LINE: |
| { |
| UChar32 c; |
| if (startPos == fAnchorStart) { |
| MatchChunkAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| U16_FWD_1(inputBuf, startPos, fActiveLimit); |
| } |
| |
| if (fPattern->fFlags & UREGEX_UNIX_LINES) { |
| for (;;) { |
| c = inputBuf[startPos-1]; |
| if (c == 0x0a) { |
| MatchChunkAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| } |
| if (startPos >= testLen) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| U16_FWD_1(inputBuf, startPos, fActiveLimit); |
| // Note that it's perfectly OK for a pattern to have a zero-length |
| // match at the end of a string, so we must make sure that the loop |
| // runs with startPos == testLen the last time through. |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| } else { |
| for (;;) { |
| c = inputBuf[startPos-1]; |
| if (isLineTerminator(c)) { |
| if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { |
| startPos++; |
| } |
| MatchChunkAt(startPos, FALSE, status); |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (fMatch) { |
| return TRUE; |
| } |
| } |
| if (startPos >= testLen) { |
| fMatch = FALSE; |
| fHitEnd = TRUE; |
| return FALSE; |
| } |
| U16_FWD_1(inputBuf, startPos, fActiveLimit); |
| // Note that it's perfectly OK for a pattern to have a zero-length |
| // match at the end of a string, so we must make sure that the loop |
| // runs with startPos == testLen the last time through. |
| if (findProgressInterrupt(startPos, status)) |
| return FALSE; |
| } |
| } |
| } |
| |
| default: |
| U_ASSERT(FALSE); |
| } |
| |
| U_ASSERT(FALSE); |
| return FALSE; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // group() |
| // |
| //-------------------------------------------------------------------------------- |
| UnicodeString RegexMatcher::group(UErrorCode &status) const { |
| return group(0, status); |
| } |
| |
| // Return immutable shallow clone |
| UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const { |
| return group(0, dest, group_len, status); |
| } |
| |
| // Return immutable shallow clone |
| UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const { |
| group_len = 0; |
| if (U_FAILURE(status)) { |
| return dest; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| } else if (fMatch == FALSE) { |
| status = U_REGEX_INVALID_STATE; |
| } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| } |
| |
| if (U_FAILURE(status)) { |
| return dest; |
| } |
| |
| int64_t s, e; |
| if (groupNum == 0) { |
| s = fMatchStart; |
| e = fMatchEnd; |
| } else { |
| int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); |
| U_ASSERT(groupOffset < fPattern->fFrameSize); |
| U_ASSERT(groupOffset >= 0); |
| s = fFrame->fExtra[groupOffset]; |
| e = fFrame->fExtra[groupOffset+1]; |
| } |
| |
| if (s < 0) { |
| // A capture group wasn't part of the match |
| return utext_clone(dest, fInputText, FALSE, TRUE, &status); |
| } |
| U_ASSERT(s <= e); |
| group_len = e - s; |
| |
| dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); |
| if (dest) |
| UTEXT_SETNATIVEINDEX(dest, s); |
| return dest; |
| } |
| |
| UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { |
| UnicodeString result; |
| int64_t groupStart = start64(groupNum, status); |
| int64_t groupEnd = end64(groupNum, status); |
| if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) { |
| return result; |
| } |
| |
| // Get the group length using a utext_extract preflight. |
| // UText is actually pretty efficient at this when underlying encoding is UTF-16. |
| int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status); |
| if (status != U_BUFFER_OVERFLOW_ERROR) { |
| return result; |
| } |
| |
| status = U_ZERO_ERROR; |
| UChar *buf = result.getBuffer(length); |
| if (buf == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } else { |
| int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status); |
| result.releaseBuffer(extractLength); |
| U_ASSERT(length == extractLength); |
| } |
| return result; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // appendGroup() -- currently internal only, appends a group to a UText rather |
| // than replacing its contents |
| // |
| //-------------------------------------------------------------------------------- |
| |
| int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const { |
| if (U_FAILURE(status)) { |
| return 0; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return 0; |
| } |
| int64_t destLen = utext_nativeLength(dest); |
| |
| if (fMatch == FALSE) { |
| status = U_REGEX_INVALID_STATE; |
| return utext_replace(dest, destLen, destLen, NULL, 0, &status); |
| } |
| if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return utext_replace(dest, destLen, destLen, NULL, 0, &status); |
| } |
| |
| int64_t s, e; |
| if (groupNum == 0) { |
| s = fMatchStart; |
| e = fMatchEnd; |
| } else { |
| int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); |
| U_ASSERT(groupOffset < fPattern->fFrameSize); |
| U_ASSERT(groupOffset >= 0); |
| s = fFrame->fExtra[groupOffset]; |
| e = fFrame->fExtra[groupOffset+1]; |
| } |
| |
| if (s < 0) { |
| // A capture group wasn't part of the match |
| return utext_replace(dest, destLen, destLen, NULL, 0, &status); |
| } |
| U_ASSERT(s <= e); |
| |
| int64_t deltaLen; |
| if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| U_ASSERT(e <= fInputLength); |
| deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status); |
| } else { |
| int32_t len16; |
| if (UTEXT_USES_U16(fInputText)) { |
| len16 = (int32_t)(e-s); |
| } else { |
| UErrorCode lengthStatus = U_ZERO_ERROR; |
| len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); |
| } |
| UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); |
| if (groupChars == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return 0; |
| } |
| utext_extract(fInputText, s, e, groupChars, len16+1, &status); |
| |
| deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status); |
| uprv_free(groupChars); |
| } |
| return deltaLen; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // groupCount() |
| // |
| //-------------------------------------------------------------------------------- |
| int32_t RegexMatcher::groupCount() const { |
| return fPattern->fGroupMap->size(); |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // hasAnchoringBounds() |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::hasAnchoringBounds() const { |
| return fAnchoringBounds; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // hasTransparentBounds() |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::hasTransparentBounds() const { |
| return fTransparentBounds; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // hitEnd() |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::hitEnd() const { |
| return fHitEnd; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // input() |
| // |
| //-------------------------------------------------------------------------------- |
| const UnicodeString &RegexMatcher::input() const { |
| if (!fInput) { |
| UErrorCode status = U_ZERO_ERROR; |
| int32_t len16; |
| if (UTEXT_USES_U16(fInputText)) { |
| len16 = (int32_t)fInputLength; |
| } else { |
| len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status); |
| status = U_ZERO_ERROR; // overflow, length status |
| } |
| UnicodeString *result = new UnicodeString(len16, 0, 0); |
| |
| UChar *inputChars = result->getBuffer(len16); |
| utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning |
| result->releaseBuffer(len16); |
| |
| (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator= |
| } |
| |
| return *fInput; |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // inputText() |
| // |
| //-------------------------------------------------------------------------------- |
| UText *RegexMatcher::inputText() const { |
| return fInputText; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // getInput() -- like inputText(), but makes a clone or copies into another UText |
| // |
| //-------------------------------------------------------------------------------- |
| UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { |
| if (U_FAILURE(status)) { |
| return dest; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return dest; |
| } |
| |
| if (dest) { |
| if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status); |
| } else { |
| int32_t input16Len; |
| if (UTEXT_USES_U16(fInputText)) { |
| input16Len = (int32_t)fInputLength; |
| } else { |
| UErrorCode lengthStatus = U_ZERO_ERROR; |
| input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error |
| } |
| UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len)); |
| if (inputChars == NULL) { |
| return dest; |
| } |
| |
| status = U_ZERO_ERROR; |
| utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning |
| status = U_ZERO_ERROR; |
| utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status); |
| |
| uprv_free(inputChars); |
| } |
| return dest; |
| } else { |
| return utext_clone(NULL, fInputText, FALSE, TRUE, &status); |
| } |
| } |
| |
| |
| static UBool compat_SyncMutableUTextContents(UText *ut); |
| static UBool compat_SyncMutableUTextContents(UText *ut) { |
| UBool retVal = FALSE; |
| |
| // In the following test, we're really only interested in whether the UText should switch |
| // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents |
| // will still point to the correct data. |
| if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { |
| UnicodeString *us=(UnicodeString *)ut->context; |
| |
| // Update to the latest length. |
| // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). |
| int32_t newLength = us->length(); |
| |
| // Update the chunk description. |
| // The buffer may have switched between stack- and heap-based. |
| ut->chunkContents = us->getBuffer(); |
| ut->chunkLength = newLength; |
| ut->chunkNativeLimit = newLength; |
| ut->nativeIndexingLimit = newLength; |
| retVal = TRUE; |
| } |
| |
| return retVal; |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // lookingAt() |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::lookingAt(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return FALSE; |
| } |
| |
| if (fInputUniStrMaybeMutable) { |
| if (compat_SyncMutableUTextContents(fInputText)) { |
| fInputLength = utext_nativeLength(fInputText); |
| reset(); |
| } |
| } |
| else { |
| resetPreserveRegion(); |
| } |
| if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| MatchChunkAt((int32_t)fActiveStart, FALSE, status); |
| } else { |
| MatchAt(fActiveStart, FALSE, status); |
| } |
| return fMatch; |
| } |
| |
| |
| UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return FALSE; |
| } |
| reset(); |
| |
| if (start < 0) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return FALSE; |
| } |
| |
| if (fInputUniStrMaybeMutable) { |
| if (compat_SyncMutableUTextContents(fInputText)) { |
| fInputLength = utext_nativeLength(fInputText); |
| reset(); |
| } |
| } |
| |
| int64_t nativeStart; |
| nativeStart = start; |
| if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return FALSE; |
| } |
| |
| if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| MatchChunkAt((int32_t)nativeStart, FALSE, status); |
| } else { |
| MatchAt(nativeStart, FALSE, status); |
| } |
| return fMatch; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // matches() |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::matches(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return FALSE; |
| } |
| |
| if (fInputUniStrMaybeMutable) { |
| if (compat_SyncMutableUTextContents(fInputText)) { |
| fInputLength = utext_nativeLength(fInputText); |
| reset(); |
| } |
| } |
| else { |
| resetPreserveRegion(); |
| } |
| |
| if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| MatchChunkAt((int32_t)fActiveStart, TRUE, status); |
| } else { |
| MatchAt(fActiveStart, TRUE, status); |
| } |
| return fMatch; |
| } |
| |
| |
| UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return FALSE; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return FALSE; |
| } |
| reset(); |
| |
| if (start < 0) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return FALSE; |
| } |
| |
| if (fInputUniStrMaybeMutable) { |
| if (compat_SyncMutableUTextContents(fInputText)) { |
| fInputLength = utext_nativeLength(fInputText); |
| reset(); |
| } |
| } |
| |
| int64_t nativeStart; |
| nativeStart = start; |
| if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return FALSE; |
| } |
| |
| if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { |
| MatchChunkAt((int32_t)nativeStart, TRUE, status); |
| } else { |
| MatchAt(nativeStart, TRUE, status); |
| } |
| return fMatch; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // pattern |
| // |
| //-------------------------------------------------------------------------------- |
| const RegexPattern &RegexMatcher::pattern() const { |
| return *fPattern; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // region |
| // |
| //-------------------------------------------------------------------------------- |
| RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| |
| if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| } |
| |
| int64_t nativeStart = regionStart; |
| int64_t nativeLimit = regionLimit; |
| if (nativeStart > fInputLength || nativeLimit > fInputLength) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| } |
| |
| if (startIndex == -1) |
| this->reset(); |
| else |
| resetPreserveRegion(); |
| |
| fRegionStart = nativeStart; |
| fRegionLimit = nativeLimit; |
| fActiveStart = nativeStart; |
| fActiveLimit = nativeLimit; |
| |
| if (startIndex != -1) { |
| if (startIndex < fActiveStart || startIndex > fActiveLimit) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| } |
| fMatchEnd = startIndex; |
| } |
| |
| if (!fTransparentBounds) { |
| fLookStart = nativeStart; |
| fLookLimit = nativeLimit; |
| } |
| if (fAnchoringBounds) { |
| fAnchorStart = nativeStart; |
| fAnchorLimit = nativeLimit; |
| } |
| return *this; |
| } |
| |
| RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) { |
| return region(start, limit, -1, status); |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // regionEnd |
| // |
| //-------------------------------------------------------------------------------- |
| int32_t RegexMatcher::regionEnd() const { |
| return (int32_t)fRegionLimit; |
| } |
| |
| int64_t RegexMatcher::regionEnd64() const { |
| return fRegionLimit; |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // regionStart |
| // |
| //-------------------------------------------------------------------------------- |
| int32_t RegexMatcher::regionStart() const { |
| return (int32_t)fRegionStart; |
| } |
| |
| int64_t RegexMatcher::regionStart64() const { |
| return fRegionStart; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // replaceAll |
| // |
| //-------------------------------------------------------------------------------- |
| UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) { |
| UText replacementText = UTEXT_INITIALIZER; |
| UText resultText = UTEXT_INITIALIZER; |
| UnicodeString resultString; |
| if (U_FAILURE(status)) { |
| return resultString; |
| } |
| |
| utext_openConstUnicodeString(&replacementText, &replacement, &status); |
| utext_openUnicodeString(&resultText, &resultString, &status); |
| |
| replaceAll(&replacementText, &resultText, status); |
| |
| utext_close(&resultText); |
| utext_close(&replacementText); |
| |
| return resultString; |
| } |
| |
| |
| // |
| // replaceAll, UText mode |
| // |
| UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return dest; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return dest; |
| } |
| |
| if (dest == NULL) { |
| UnicodeString emptyString; |
| UText empty = UTEXT_INITIALIZER; |
| |
| utext_openUnicodeString(&empty, &emptyString, &status); |
| dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); |
| utext_close(&empty); |
| } |
| |
| if (U_SUCCESS(status)) { |
| reset(); |
| while (find()) { |
| appendReplacement(dest, replacement, status); |
| if (U_FAILURE(status)) { |
| break; |
| } |
| } |
| appendTail(dest, status); |
| } |
| |
| return dest; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // replaceFirst |
| // |
| //-------------------------------------------------------------------------------- |
| UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { |
| UText replacementText = UTEXT_INITIALIZER; |
| UText resultText = UTEXT_INITIALIZER; |
| UnicodeString resultString; |
| |
| utext_openConstUnicodeString(&replacementText, &replacement, &status); |
| utext_openUnicodeString(&resultText, &resultString, &status); |
| |
| replaceFirst(&replacementText, &resultText, status); |
| |
| utext_close(&resultText); |
| utext_close(&replacementText); |
| |
| return resultString; |
| } |
| |
| // |
| // replaceFirst, UText mode |
| // |
| UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return dest; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return dest; |
| } |
| |
| reset(); |
| if (!find()) { |
| return getInput(dest, status); |
| } |
| |
| if (dest == NULL) { |
| UnicodeString emptyString; |
| UText empty = UTEXT_INITIALIZER; |
| |
| utext_openUnicodeString(&empty, &emptyString, &status); |
| dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); |
| utext_close(&empty); |
| } |
| |
| appendReplacement(dest, replacement, status); |
| appendTail(dest, status); |
| |
| return dest; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // requireEnd |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::requireEnd() const { |
| return fRequireEnd; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // reset |
| // |
| //-------------------------------------------------------------------------------- |
| RegexMatcher &RegexMatcher::reset() { |
| fRegionStart = 0; |
| fRegionLimit = fInputLength; |
| fActiveStart = 0; |
| fActiveLimit = fInputLength; |
| fAnchorStart = 0; |
| fAnchorLimit = fInputLength; |
| fLookStart = 0; |
| fLookLimit = fInputLength; |
| resetPreserveRegion(); |
| return *this; |
| } |
| |
| |
| |
| void RegexMatcher::resetPreserveRegion() { |
| fMatchStart = 0; |
| fMatchEnd = 0; |
| fLastMatchEnd = -1; |
| fAppendPosition = 0; |
| fMatch = FALSE; |
| fHitEnd = FALSE; |
| fRequireEnd = FALSE; |
| fTime = 0; |
| fTickCounter = TIMER_INITIAL_VALUE; |
| //resetStack(); // more expensive than it looks... |
| } |
| |
| |
| RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { |
| fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus); |
| if (fPattern->fNeedsAltInput) { |
| fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| return *this; |
| } |
| fInputLength = utext_nativeLength(fInputText); |
| |
| reset(); |
| delete fInput; |
| fInput = NULL; |
| |
| // Do the following for any UnicodeString. |
| // This is for compatibility for those clients who modify the input string "live" during regex operations. |
| fInputUniStrMaybeMutable = TRUE; |
| |
| if (fWordBreakItr != NULL) { |
| #if UCONFIG_NO_BREAK_ITERATION==0 |
| UErrorCode status = U_ZERO_ERROR; |
| fWordBreakItr->setText(fInputText, status); |
| #endif |
| } |
| return *this; |
| } |
| |
| |
| RegexMatcher &RegexMatcher::reset(UText *input) { |
| if (fInputText != input) { |
| fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus); |
| if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); |
| if (U_FAILURE(fDeferredStatus)) { |
| return *this; |
| } |
| fInputLength = utext_nativeLength(fInputText); |
| |
| delete fInput; |
| fInput = NULL; |
| |
| if (fWordBreakItr != NULL) { |
| #if UCONFIG_NO_BREAK_ITERATION==0 |
| UErrorCode status = U_ZERO_ERROR; |
| fWordBreakItr->setText(input, status); |
| #endif |
| } |
| } |
| reset(); |
| fInputUniStrMaybeMutable = FALSE; |
| |
| return *this; |
| } |
| |
| /*RegexMatcher &RegexMatcher::reset(const UChar *) { |
| fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; |
| return *this; |
| }*/ |
| |
| RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| reset(); // Reset also resets the region to be the entire string. |
| |
| if (position < 0 || position > fActiveLimit) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return *this; |
| } |
| fMatchEnd = position; |
| return *this; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // refresh |
| // |
| //-------------------------------------------------------------------------------- |
| RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| if (input == NULL) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return *this; |
| } |
| if (utext_nativeLength(fInputText) != utext_nativeLength(input)) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return *this; |
| } |
| int64_t pos = utext_getNativeIndex(fInputText); |
| // Shallow read-only clone of the new UText into the existing input UText |
| fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status); |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| utext_setNativeIndex(fInputText, pos); |
| |
| if (fAltInputText != NULL) { |
| pos = utext_getNativeIndex(fAltInputText); |
| fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status); |
| if (U_FAILURE(status)) { |
| return *this; |
| } |
| utext_setNativeIndex(fAltInputText, pos); |
| } |
| return *this; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // setTrace |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::setTrace(UBool state) { |
| fTraceDebug = state; |
| } |
| |
| |
| |
| /** |
| * UText, replace entire contents of the destination UText with a substring of the source UText. |
| * |
| * @param src The source UText |
| * @param dest The destination UText. Must be writable. |
| * May be NULL, in which case a new UText will be allocated. |
| * @param start Start index of source substring. |
| * @param limit Limit index of source substring. |
| * @param status An error code. |
| */ |
| static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) { |
| if (U_FAILURE(*status)) { |
| return dest; |
| } |
| if (start == limit) { |
| if (dest) { |
| utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status); |
| return dest; |
| } else { |
| return utext_openUChars(NULL, NULL, 0, status); |
| } |
| } |
| int32_t length = utext_extract(src, start, limit, NULL, 0, status); |
| if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) { |
| return dest; |
| } |
| *status = U_ZERO_ERROR; |
| MaybeStackArray<UChar, 40> buffer; |
| if (length >= buffer.getCapacity()) { |
| UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul. |
| if (newBuf == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| } |
| utext_extract(src, start, limit, buffer.getAlias(), length+1, status); |
| if (dest) { |
| utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status); |
| return dest; |
| } |
| |
| // Caller did not provide a prexisting UText. |
| // Open a new one, and have it adopt the text buffer storage. |
| if (U_FAILURE(*status)) { |
| return NULL; |
| } |
| int32_t ownedLength = 0; |
| UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength); |
| if (ownedBuf == NULL) { |
| *status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| UText *result = utext_openUChars(NULL, ownedBuf, length, status); |
| if (U_FAILURE(*status)) { |
| uprv_free(ownedBuf); |
| return NULL; |
| } |
| result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT); |
| return result; |
| } |
| |
| |
| //--------------------------------------------------------------------- |
| // |
| // split |
| // |
| //--------------------------------------------------------------------- |
| int32_t RegexMatcher::split(const UnicodeString &input, |
| UnicodeString dest[], |
| int32_t destCapacity, |
| UErrorCode &status) |
| { |
| UText inputText = UTEXT_INITIALIZER; |
| utext_openConstUnicodeString(&inputText, &input, &status); |
| if (U_FAILURE(status)) { |
| return 0; |
| } |
| |
| UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); |
| if (destText == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return 0; |
| } |
| int32_t i; |
| for (i = 0; i < destCapacity; i++) { |
| destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); |
| } |
| |
| int32_t fieldCount = split(&inputText, destText, destCapacity, status); |
| |
| for (i = 0; i < destCapacity; i++) { |
| utext_close(destText[i]); |
| } |
| |
| uprv_free(destText); |
| utext_close(&inputText); |
| return fieldCount; |
| } |
| |
| // |
| // split, UText mode |
| // |
| int32_t RegexMatcher::split(UText *input, |
| UText *dest[], |
| int32_t destCapacity, |
| UErrorCode &status) |
| { |
| // |
| // Check arguements for validity |
| // |
| if (U_FAILURE(status)) { |
| return 0; |
| }; |
| |
| if (destCapacity < 1) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| |
| // |
| // Reset for the input text |
| // |
| reset(input); |
| int64_t nextOutputStringStart = 0; |
| if (fActiveLimit == 0) { |
| return 0; |
| } |
| |
| // |
| // Loop through the input text, searching for the delimiter pattern |
| // |
| int32_t i; |
| int32_t numCaptureGroups = fPattern->fGroupMap->size(); |
| for (i=0; ; i++) { |
| if (i>=destCapacity-1) { |
| // There is one or zero output string left. |
| // Fill the last output string with whatever is left from the input, then exit the loop. |
| // ( i will be == destCapacity if we filled the output array while processing |
| // capture groups of the delimiter expression, in which case we will discard the |
| // last capture group saved in favor of the unprocessed remainder of the |
| // input string.) |
| i = destCapacity-1; |
| if (fActiveLimit > nextOutputStringStart) { |
| if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { |
| if (dest[i]) { |
| utext_replace(dest[i], 0, utext_nativeLength(dest[i]), |
| input->chunkContents+nextOutputStringStart, |
| (int32_t)(fActiveLimit-nextOutputStringStart), &status); |
| } else { |
| UText remainingText = UTEXT_INITIALIZER; |
| utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, |
| fActiveLimit-nextOutputStringStart, &status); |
| dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); |
| utext_close(&remainingText); |
| } |
| } else { |
| UErrorCode lengthStatus = U_ZERO_ERROR; |
| int32_t remaining16Length = |
| utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); |
| UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); |
| if (remainingChars == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| break; |
| } |
| |
| utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); |
| if (dest[i]) { |
| utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); |
| } else { |
| UText remainingText = UTEXT_INITIALIZER; |
| utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); |
| dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); |
| utext_close(&remainingText); |
| } |
| |
| uprv_free(remainingChars); |
| } |
| } |
| break; |
| } |
| if (find()) { |
| // We found another delimiter. Move everything from where we started looking |
| // up until the start of the delimiter into the next output string. |
| if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { |
| if (dest[i]) { |
| utext_replace(dest[i], 0, utext_nativeLength(dest[i]), |
| input->chunkContents+nextOutputStringStart, |
| (int32_t)(fMatchStart-nextOutputStringStart), &status); |
| } else { |
| UText remainingText = UTEXT_INITIALIZER; |
| utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, |
| fMatchStart-nextOutputStringStart, &status); |
| dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); |
| utext_close(&remainingText); |
| } |
| } else { |
| UErrorCode lengthStatus = U_ZERO_ERROR; |
| int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus); |
| UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); |
| if (remainingChars == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| break; |
| } |
| utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status); |
| if (dest[i]) { |
| utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); |
| } else { |
| UText remainingText = UTEXT_INITIALIZER; |
| utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); |
| dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); |
| utext_close(&remainingText); |
| } |
| |
| uprv_free(remainingChars); |
| } |
| nextOutputStringStart = fMatchEnd; |
| |
| // If the delimiter pattern has capturing parentheses, the captured |
| // text goes out into the next n destination strings. |
| int32_t groupNum; |
| for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { |
| if (i >= destCapacity-2) { |
| // Never fill the last available output string with capture group text. |
| // It will filled with the last field, the remainder of the |
| // unsplit input text. |
| break; |
| } |
| i++; |
| dest[i] = utext_extract_replace(fInputText, dest[i], |
| start64(groupNum, status), end64(groupNum, status), &status); |
| } |
| |
| if (nextOutputStringStart == fActiveLimit) { |
| // The delimiter was at the end of the string. We're done, but first |
| // we output one last empty string, for the empty field following |
| // the delimiter at the end of input. |
| if (i+1 < destCapacity) { |
| ++i; |
| if (dest[i] == NULL) { |
| dest[i] = utext_openUChars(NULL, NULL, 0, &status); |
| } else { |
| static UChar emptyString[] = {(UChar)0}; |
| utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status); |
| } |
| } |
| break; |
| |
| } |
| } |
| else |
| { |
| // We ran off the end of the input while looking for the next delimiter. |
| // All the remaining text goes into the current output string. |
| if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { |
| if (dest[i]) { |
| utext_replace(dest[i], 0, utext_nativeLength(dest[i]), |
| input->chunkContents+nextOutputStringStart, |
| (int32_t)(fActiveLimit-nextOutputStringStart), &status); |
| } else { |
| UText remainingText = UTEXT_INITIALIZER; |
| utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, |
| fActiveLimit-nextOutputStringStart, &status); |
| dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); |
| utext_close(&remainingText); |
| } |
| } else { |
| UErrorCode lengthStatus = U_ZERO_ERROR; |
| int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); |
| UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); |
| if (remainingChars == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| break; |
| } |
| |
| utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); |
| if (dest[i]) { |
| utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); |
| } else { |
| UText remainingText = UTEXT_INITIALIZER; |
| utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); |
| dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); |
| utext_close(&remainingText); |
| } |
| |
| uprv_free(remainingChars); |
| } |
| break; |
| } |
| if (U_FAILURE(status)) { |
| break; |
| } |
| } // end of for loop |
| return i+1; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // start |
| // |
| //-------------------------------------------------------------------------------- |
| int32_t RegexMatcher::start(UErrorCode &status) const { |
| return start(0, status); |
| } |
| |
| int64_t RegexMatcher::start64(UErrorCode &status) const { |
| return start64(0, status); |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // start(int32_t group, UErrorCode &status) |
| // |
| //-------------------------------------------------------------------------------- |
| |
| int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const { |
| if (U_FAILURE(status)) { |
| return -1; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return -1; |
| } |
| if (fMatch == FALSE) { |
| status = U_REGEX_INVALID_STATE; |
| return -1; |
| } |
| if (group < 0 || group > fPattern->fGroupMap->size()) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| return -1; |
| } |
| int64_t s; |
| if (group == 0) { |
| s = fMatchStart; |
| } else { |
| int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); |
| U_ASSERT(groupOffset < fPattern->fFrameSize); |
| U_ASSERT(groupOffset >= 0); |
| s = fFrame->fExtra[groupOffset]; |
| } |
| |
| return s; |
| } |
| |
| |
| int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { |
| return (int32_t)start64(group, status); |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // useAnchoringBounds |
| // |
| //-------------------------------------------------------------------------------- |
| RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { |
| fAnchoringBounds = b; |
| fAnchorStart = (fAnchoringBounds ? fRegionStart : 0); |
| fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength); |
| return *this; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // useTransparentBounds |
| // |
| //-------------------------------------------------------------------------------- |
| RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) { |
| fTransparentBounds = b; |
| fLookStart = (fTransparentBounds ? 0 : fRegionStart); |
| fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit); |
| return *this; |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // setTimeLimit |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return; |
| } |
| if (limit < 0) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| fTimeLimit = limit; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // getTimeLimit |
| // |
| //-------------------------------------------------------------------------------- |
| int32_t RegexMatcher::getTimeLimit() const { |
| return fTimeLimit; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // setStackLimit |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return; |
| } |
| if (limit < 0) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| |
| // Reset the matcher. This is needed here in case there is a current match |
| // whose final stack frame (containing the match results, pointed to by fFrame) |
| // would be lost by resizing to a smaller stack size. |
| reset(); |
| |
| if (limit == 0) { |
| // Unlimited stack expansion |
| fStack->setMaxCapacity(0); |
| } else { |
| // Change the units of the limit from bytes to ints, and bump the size up |
| // to be big enough to hold at least one stack frame for the pattern, |
| // if it isn't there already. |
| int32_t adjustedLimit = limit / sizeof(int32_t); |
| if (adjustedLimit < fPattern->fFrameSize) { |
| adjustedLimit = fPattern->fFrameSize; |
| } |
| fStack->setMaxCapacity(adjustedLimit); |
| } |
| fStackLimit = limit; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // getStackLimit |
| // |
| //-------------------------------------------------------------------------------- |
| int32_t RegexMatcher::getStackLimit() const { |
| return fStackLimit; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // setMatchCallback |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::setMatchCallback(URegexMatchCallback *callback, |
| const void *context, |
| UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| fCallbackFn = callback; |
| fCallbackContext = context; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // getMatchCallback |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback, |
| const void *&context, |
| UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| callback = fCallbackFn; |
| context = fCallbackContext; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // setMatchCallback |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback, |
| const void *context, |
| UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| fFindProgressCallbackFn = callback; |
| fFindProgressCallbackContext = context; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // getMatchCallback |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback, |
| const void *&context, |
| UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| callback = fFindProgressCallbackFn; |
| context = fFindProgressCallbackContext; |
| } |
| |
| |
| //================================================================================ |
| // |
| // Code following this point in this file is the internal |
| // Match Engine Implementation. |
| // |
| //================================================================================ |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // resetStack |
| // Discard any previous contents of the state save stack, and initialize a |
| // new stack frame to all -1. The -1s are needed for capture group limits, |
| // where they indicate that a group has not yet matched anything. |
| //-------------------------------------------------------------------------------- |
| REStackFrame *RegexMatcher::resetStack() { |
| // Discard any previous contents of the state save stack, and initialize a |
| // new stack frame with all -1 data. The -1s are needed for capture group limits, |
| // where they indicate that a group has not yet matched anything. |
| fStack->removeAllElements(); |
| |
| REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); |
| if(U_FAILURE(fDeferredStatus)) { |
| return NULL; |
| } |
| |
| int32_t i; |
| for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { |
| iFrame->fExtra[i] = -1; |
| } |
| return iFrame; |
| } |
| |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // isWordBoundary |
| // in perl, "xab..cd..", \b is true at positions 0,3,5,7 |
| // For us, |
| // If the current char is a combining mark, |
| // \b is FALSE. |
| // Else Scan backwards to the first non-combining char. |
| // We are at a boundary if the this char and the original chars are |
| // opposite in membership in \w set |
| // |
| // parameters: pos - the current position in the input buffer |
| // |
| // TODO: double-check edge cases at region boundaries. |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::isWordBoundary(int64_t pos) { |
| UBool isBoundary = FALSE; |
| UBool cIsWord = FALSE; |
| |
| if (pos >= fLookLimit) { |
| fHitEnd = TRUE; |
| } else { |
| // Determine whether char c at current position is a member of the word set of chars. |
| // If we're off the end of the string, behave as though we're not at a word char. |
| UTEXT_SETNATIVEINDEX(fInputText, pos); |
| UChar32 c = UTEXT_CURRENT32(fInputText); |
| if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { |
| // Current char is a combining one. Not a boundary. |
| return FALSE; |
| } |
| cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); |
| } |
| |
| // Back up until we come to a non-combining char, determine whether |
| // that char is a word char. |
| UBool prevCIsWord = FALSE; |
| for (;;) { |
| if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { |
| break; |
| } |
| UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); |
| if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) |
| || u_charType(prevChar) == U_FORMAT_CHAR)) { |
| prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); |
| break; |
| } |
| } |
| isBoundary = cIsWord ^ prevCIsWord; |
| return isBoundary; |
| } |
| |
| UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { |
| UBool isBoundary = FALSE; |
| UBool cIsWord = FALSE; |
| |
| const UChar *inputBuf = fInputText->chunkContents; |
| |
| if (pos >= fLookLimit) { |
| fHitEnd = TRUE; |
| } else { |
| // Determine whether char c at current position is a member of the word set of chars. |
| // If we're off the end of the string, behave as though we're not at a word char. |
| UChar32 c; |
| U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); |
| if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { |
| // Current char is a combining one. Not a boundary. |
| return FALSE; |
| } |
| cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); |
| } |
| |
| // Back up until we come to a non-combining char, determine whether |
| // that char is a word char. |
| UBool prevCIsWord = FALSE; |
| for (;;) { |
| if (pos <= fLookStart) { |
| break; |
| } |
| UChar32 prevChar; |
| U16_PREV(inputBuf, fLookStart, pos, prevChar); |
| if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) |
| || u_charType(prevChar) == U_FORMAT_CHAR)) { |
| prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); |
| break; |
| } |
| } |
| isBoundary = cIsWord ^ prevCIsWord; |
| return isBoundary; |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // isUWordBoundary |
| // |
| // Test for a word boundary using RBBI word break. |
| // |
| // parameters: pos - the current position in the input buffer |
| // |
| //-------------------------------------------------------------------------------- |
| UBool RegexMatcher::isUWordBoundary(int64_t pos) { |
| UBool returnVal = FALSE; |
| #if UCONFIG_NO_BREAK_ITERATION==0 |
| |
| // If we haven't yet created a break iterator for this matcher, do it now. |
| if (fWordBreakItr == NULL) { |
| fWordBreakItr = |
| (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus); |
| if (U_FAILURE(fDeferredStatus)) { |
| return FALSE; |
| } |
| fWordBreakItr->setText(fInputText, fDeferredStatus); |
| } |
| |
| if (pos >= fLookLimit) { |
| fHitEnd = TRUE; |
| returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real" |
| // words are not boundaries. All non-word chars stand by themselves, |
| // with word boundaries on both sides. |
| } else { |
| if (!UTEXT_USES_U16(fInputText)) { |
| // !!!: Would like a better way to do this! |
| UErrorCode status = U_ZERO_ERROR; |
| pos = utext_extract(fInputText, 0, pos, NULL, 0, &status); |
| } |
| returnVal = fWordBreakItr->isBoundary((int32_t)pos); |
| } |
| #endif |
| return returnVal; |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // IncrementTime This function is called once each TIMER_INITIAL_VALUE state |
| // saves. Increment the "time" counter, and call the |
| // user callback function if there is one installed. |
| // |
| // If the match operation needs to be aborted, either for a time-out |
| // or because the user callback asked for it, just set an error status. |
| // The engine will pick that up and stop in its outer loop. |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::IncrementTime(UErrorCode &status) { |
| fTickCounter = TIMER_INITIAL_VALUE; |
| fTime++; |
| if (fCallbackFn != NULL) { |
| if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) { |
| status = U_REGEX_STOPPED_BY_CALLER; |
| return; |
| } |
| } |
| if (fTimeLimit > 0 && fTime >= fTimeLimit) { |
| status = U_REGEX_TIME_OUT; |
| } |
| } |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // StateSave |
| // Make a new stack frame, initialized as a copy of the current stack frame. |
| // Set the pattern index in the original stack frame from the operand value |
| // in the opcode. Execution of the engine continues with the state in |
| // the newly created stack frame |
| // |
| // Note that reserveBlock() may grow the stack, resulting in the |
| // whole thing being relocated in memory. |
| // |
| // Parameters: |
| // fp The top frame pointer when called. At return, a new |
| // fame will be present |
| // savePatIdx An index into the compiled pattern. Goes into the original |
| // (not new) frame. If execution ever back-tracks out of the |
| // new frame, this will be where we continue from in the pattern. |
| // Return |
| // The new frame pointer. |
| // |
| //-------------------------------------------------------------------------------- |
| inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return fp; |
| } |
| // push storage for a new frame. |
| int64_t *newFP = fStack->reserveBlock(fFrameSize, status); |
| if (U_FAILURE(status)) { |
| // Failure on attempted stack expansion. |
| // Stack function set some other error code, change it to a more |
| // specific one for regular expressions. |
| status = U_REGEX_STACK_OVERFLOW; |
| // We need to return a writable stack frame, so just return the |
| // previous frame. The match operation will stop quickly |
| // because of the error status, after which the frame will never |
| // be looked at again. |
| return fp; |
| } |
| fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. |
| |
| // New stack frame = copy of old top frame. |
| int64_t *source = (int64_t *)fp; |
| int64_t *dest = newFP; |
| for (;;) { |
| *dest++ = *source++; |
| if (source == newFP) { |
| break; |
| } |
| } |
| |
| fTickCounter--; |
| if (fTickCounter <= 0) { |
| IncrementTime(status); // Re-initializes fTickCounter |
| } |
| fp->fPatIdx = savePatIdx; |
| return (REStackFrame *)newFP; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // MatchAt This is the actual matching engine. |
| // |
| // startIdx: begin matching a this index. |
| // toEnd: if true, match must extend to end of the input region |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { |
| UBool isMatch = FALSE; // True if the we have a match. |
| |
| int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards |
| |
| int32_t op; // Operation from the compiled pattern, split into |
| int32_t opType; // the opcode |
| int32_t opValue; // and the operand value. |
| |
| #ifdef REGEX_RUN_DEBUG |
| if (fTraceDebug) |
| { |
| printf("MatchAt(startIdx=%ld)\n", startIdx); |
| printf("Original Pattern: "); |
| UChar32 c = utext_next32From(fPattern->fPattern, 0); |
| while (c != U_SENTINEL) { |
| if (c<32 || c>256) { |
| c = '.'; |
| } |
| printf("%c", c); |
| |
| c = UTEXT_NEXT32(fPattern->fPattern); |
| } |
| printf("\n"); |
| printf("Input String: "); |
| c = utext_next32From(fInputText, 0); |
| while (c != U_SENTINEL) { |
| if (c<32 || c>256) { |
| c = '.'; |
| } |
| printf("%c", c); |
| |
| c = UTEXT_NEXT32(fInputText); |
| } |
| printf("\n"); |
| printf("\n"); |
| } |
| #endif |
| |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| // Cache frequently referenced items from the compiled pattern |
| // |
| int64_t *pat = fPattern->fCompiledPat->getBuffer(); |
| |
| const UChar *litText = fPattern->fLiteralText.getBuffer(); |
| UVector *sets = fPattern->fSets; |
| |
| fFrameSize = fPattern->fFrameSize; |
| REStackFrame *fp = resetStack(); |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return; |
| } |
| |
| fp->fPatIdx = 0; |
| fp->fInputIdx = startIdx; |
| |
| // Zero out the pattern's static data |
| int32_t i; |
| for (i = 0; i<fPattern->fDataSize; i++) { |
| fData[i] = 0; |
| } |
| |
| // |
| // Main loop for interpreting the compiled pattern. |
| // One iteration of the loop per pattern operation performed. |
| // |
| for (;;) { |
| op = (int32_t)pat[fp->fPatIdx]; |
| opType = URX_TYPE(op); |
| opValue = URX_VAL(op); |
| #ifdef REGEX_RUN_DEBUG |
| if (fTraceDebug) { |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, |
| UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); |
| fPattern->dumpOp(fp->fPatIdx); |
| } |
| #endif |
| fp->fPatIdx++; |
| |
| switch (opType) { |
| |
| |
| case URX_NOP: |
| break; |
| |
| |
| case URX_BACKTRACK: |
| // Force a backtrack. In some circumstances, the pattern compiler |
| // will notice that the pattern can't possibly match anything, and will |
| // emit one of these at that point. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| |
| |
| case URX_ONECHAR: |
| if (fp->fInputIdx < fActiveLimit) { |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| UChar32 c = UTEXT_NEXT32(fInputText); |
| if (c == opValue) { |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| break; |
| } |
| } else { |
| fHitEnd = TRUE; |
| } |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| |
| |
| case URX_STRING: |
| { |
| // Test input against a literal string. |
| // Strings require two slots in the compiled pattern, one for the |
| // offset to the string text, and one for the length. |
| |
| int32_t stringStartIdx = opValue; |
| op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand |
| fp->fPatIdx++; |
| opType = URX_TYPE(op); |
| int32_t stringLen = URX_VAL(op); |
| U_ASSERT(opType == URX_STRING_LEN); |
| U_ASSERT(stringLen >= 2); |
| |
| const UChar *patternString = litText+stringStartIdx; |
| int32_t patternStringIndex = 0; |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| UChar32 inputChar; |
| UChar32 patternChar; |
| UBool success = TRUE; |
| while (patternStringIndex < stringLen) { |
| if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { |
| success = FALSE; |
| fHitEnd = TRUE; |
| break; |
| } |
| inputChar = UTEXT_NEXT32(fInputText); |
| U16_NEXT(patternString, patternStringIndex, stringLen, patternChar); |
| if (patternChar != inputChar) { |
| success = FALSE; |
| break; |
| } |
| } |
| |
| if (success) { |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| } else { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_STATE_SAVE: |
| fp = StateSave(fp, opValue, status); |
| break; |
| |
| |
| case URX_END: |
| // The match loop will exit via this path on a successful match, |
| // when we reach the end of the pattern. |
| if (toEnd && fp->fInputIdx != fActiveLimit) { |
| // The pattern matched, but not to the end of input. Try some more. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| isMatch = TRUE; |
| goto breakFromLoop; |
| |
| // Start and End Capture stack frame variables are laid out out like this: |
| // fp->fExtra[opValue] - The start of a completed capture group |
| // opValue+1 - The end of a completed capture group |
| // opValue+2 - the start of a capture group whose end |
| // has not yet been reached (and might not ever be). |
| case URX_START_CAPTURE: |
| U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); |
| fp->fExtra[opValue+2] = fp->fInputIdx; |
| break; |
| |
| |
| case URX_END_CAPTURE: |
| U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); |
| U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. |
| fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. |
| fp->fExtra[opValue+1] = fp->fInputIdx; // End position |
| U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); |
| break; |
| |
| |
| case URX_DOLLAR: // $, test for End of line |
| // or for position before new line at end of input |
| { |
| if (fp->fInputIdx >= fAnchorLimit) { |
| // We really are at the end of input. Success. |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| break; |
| } |
| |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| |
| // If we are positioned just before a new-line that is located at the |
| // end of input, succeed. |
| UChar32 c = UTEXT_NEXT32(fInputText); |
| if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { |
| if (isLineTerminator(c)) { |
| // If not in the middle of a CR/LF sequence |
| if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { |
| // At new-line at end of input. Success |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| |
| break; |
| } |
| } |
| } else { |
| UChar32 nextC = UTEXT_NEXT32(fInputText); |
| if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| break; // At CR/LF at end of input. Success |
| } |
| } |
| |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| break; |
| |
| |
| case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. |
| if (fp->fInputIdx >= fAnchorLimit) { |
| // Off the end of input. Success. |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| break; |
| } else { |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| UChar32 c = UTEXT_NEXT32(fInputText); |
| // Either at the last character of input, or off the end. |
| if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) { |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| break; |
| } |
| } |
| |
| // Not at end of input. Back-track out. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| |
| |
| case URX_DOLLAR_M: // $, test for End of line in multi-line mode |
| { |
| if (fp->fInputIdx >= fAnchorLimit) { |
| // We really are at the end of input. Success. |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| break; |
| } |
| // If we are positioned just before a new-line, succeed. |
| // It makes no difference where the new-line is within the input. |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| UChar32 c = UTEXT_CURRENT32(fInputText); |
| if (isLineTerminator(c)) { |
| // At a line end, except for the odd chance of being in the middle of a CR/LF sequence |
| // In multi-line mode, hitting a new-line just before the end of input does not |
| // set the hitEnd or requireEnd flags |
| if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) { |
| break; |
| } |
| } |
| // not at a new line. Fail. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| break; |
| |
| |
| case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode |
| { |
| if (fp->fInputIdx >= fAnchorLimit) { |
| // We really are at the end of input. Success. |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; // Java set requireEnd in this case, even though |
| break; // adding a new-line would not lose the match. |
| } |
| // If we are not positioned just before a new-line, the test fails; backtrack out. |
| // It makes no difference where the new-line is within the input. |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| if (UTEXT_CURRENT32(fInputText) != 0x0a) { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_CARET: // ^, test for start of line |
| if (fp->fInputIdx != fAnchorStart) { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| break; |
| |
| |
| case URX_CARET_M: // ^, test for start of line in mulit-line mode |
| { |
| if (fp->fInputIdx == fAnchorStart) { |
| // We are at the start input. Success. |
| break; |
| } |
| // Check whether character just before the current pos is a new-line |
| // unless we are at the end of input |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| UChar32 c = UTEXT_PREVIOUS32(fInputText); |
| if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) { |
| // It's a new-line. ^ is true. Success. |
| // TODO: what should be done with positions between a CR and LF? |
| break; |
| } |
| // Not at the start of a line. Fail. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| break; |
| |
| |
| case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode |
| { |
| U_ASSERT(fp->fInputIdx >= fAnchorStart); |
| if (fp->fInputIdx <= fAnchorStart) { |
| // We are at the start input. Success. |
| break; |
| } |
| // Check whether character just before the current pos is a new-line |
| U_ASSERT(fp->fInputIdx <= fAnchorLimit); |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| UChar32 c = UTEXT_PREVIOUS32(fInputText); |
| if (c != 0x0a) { |
| // Not at the start of a line. Back-track out. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| case URX_BACKSLASH_B: // Test for word boundaries |
| { |
| UBool success = isWordBoundary(fp->fInputIdx); |
| success ^= (UBool)(opValue != 0); // flip sense for \B |
| if (!success) { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style |
| { |
| UBool success = isUWordBoundary(fp->fInputIdx); |
| success ^= (UBool)(opValue != 0); // flip sense for \B |
| if (!success) { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_BACKSLASH_D: // Test for decimal digit |
| { |
| if (fp->fInputIdx >= fActiveLimit) { |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| |
| UChar32 c = UTEXT_NEXT32(fInputText); |
| int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. |
| UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); |
| success ^= (UBool)(opValue != 0); // flip sense for \D |
| if (success) { |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| } else { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_BACKSLASH_G: // Test for position at end of previous match |
| if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| break; |
| |
| |
| case URX_BACKSLASH_H: // Test for \h, horizontal white space. |
| { |
| if (fp->fInputIdx >= fActiveLimit) { |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| UChar32 c = UTEXT_NEXT32(fInputText); |
| int8_t ctype = u_charType(c); |
| UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB |
| success ^= (UBool)(opValue != 0); // flip sense for \H |
| if (success) { |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| } else { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_BACKSLASH_R: // Test for \R, any line break sequence. |
| { |
| if (fp->fInputIdx >= fActiveLimit) { |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| UChar32 c = UTEXT_NEXT32(fInputText); |
| if (isLineTerminator(c)) { |
| if (c == 0x0d && utext_current32(fInputText) == 0x0a) { |
| utext_next32(fInputText); |
| } |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| } else { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_BACKSLASH_V: // \v, any single line ending character. |
| { |
| if (fp->fInputIdx >= fActiveLimit) { |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| UChar32 c = UTEXT_NEXT32(fInputText); |
| UBool success = isLineTerminator(c); |
| success ^= (UBool)(opValue != 0); // flip sense for \V |
| if (success) { |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| } else { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_BACKSLASH_X: |
| // Match a Grapheme, as defined by Unicode TR 29. |
| // Differs slightly from Perl, which consumes combining marks independently |
| // of context. |
| { |
| |
| // Fail if at end of input |
| if (fp->fInputIdx >= fActiveLimit) { |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| |
| // Examine (and consume) the current char. |
| // Dispatch into a little state machine, based on the char. |
| UChar32 c; |
| c = UTEXT_NEXT32(fInputText); |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| UnicodeSet **sets = fPattern->fStaticSets; |
| if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; |
| if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; |
| if (sets[URX_GC_L]->contains(c)) goto GC_L; |
| if (sets[URX_GC_LV]->contains(c)) goto GC_V; |
| if (sets[URX_GC_LVT]->contains(c)) goto GC_T; |
| if (sets[URX_GC_V]->contains(c)) goto GC_V; |
| if (sets[URX_GC_T]->contains(c)) goto GC_T; |
| goto GC_Extend; |
| |
| |
| |
| GC_L: |
| if (fp->fInputIdx >= fActiveLimit) goto GC_Done; |
| c = UTEXT_NEXT32(fInputText); |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| if (sets[URX_GC_L]->contains(c)) goto GC_L; |
| if (sets[URX_GC_LV]->contains(c)) goto GC_V; |
| if (sets[URX_GC_LVT]->contains(c)) goto GC_T; |
| if (sets[URX_GC_V]->contains(c)) goto GC_V; |
| (void)UTEXT_PREVIOUS32(fInputText); |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| goto GC_Extend; |
| |
| GC_V: |
| if (fp->fInputIdx >= fActiveLimit) goto GC_Done; |
| c = UTEXT_NEXT32(fInputText); |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| if (sets[URX_GC_V]->contains(c)) goto GC_V; |
| if (sets[URX_GC_T]->contains(c)) goto GC_T; |
| (void)UTEXT_PREVIOUS32(fInputText); |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| goto GC_Extend; |
| |
| GC_T: |
| if (fp->fInputIdx >= fActiveLimit) goto GC_Done; |
| c = UTEXT_NEXT32(fInputText); |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| if (sets[URX_GC_T]->contains(c)) goto GC_T; |
| (void)UTEXT_PREVIOUS32(fInputText); |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| goto GC_Extend; |
| |
| GC_Extend: |
| // Combining characters are consumed here |
| for (;;) { |
| if (fp->fInputIdx >= fActiveLimit) { |
| break; |
| } |
| c = UTEXT_CURRENT32(fInputText); |
| if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { |
| break; |
| } |
| (void)UTEXT_NEXT32(fInputText); |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| } |
| goto GC_Done; |
| |
| GC_Control: |
| // Most control chars stand alone (don't combine with combining chars), |
| // except for that CR/LF sequence is a single grapheme cluster. |
| if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { |
| c = UTEXT_NEXT32(fInputText); |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| } |
| |
| GC_Done: |
| if (fp->fInputIdx >= fActiveLimit) { |
| fHitEnd = TRUE; |
| } |
| break; |
| } |
| |
| |
| |
| |
| case URX_BACKSLASH_Z: // Test for end of Input |
| if (fp->fInputIdx < fAnchorLimit) { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } else { |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| } |
| break; |
| |
| |
| |
| case URX_STATIC_SETREF: |
| { |
| // Test input character against one of the predefined sets |
| // (Word Characters, for example) |
| // The high bit of the op value is a flag for the match polarity. |
| // 0: success if input char is in set. |
| // 1: success if input char is not in set. |
| if (fp->fInputIdx >= fActiveLimit) { |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); |
| opValue &= ~URX_NEG_SET; |
| U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); |
| |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| UChar32 c = UTEXT_NEXT32(fInputText); |
| if (c < 256) { |
| Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; |
| if (s8->contains(c)) { |
| success = !success; |
| } |
| } else { |
| const UnicodeSet *s = fPattern->fStaticSets[opValue]; |
| if (s->contains(c)) { |
| success = !success; |
| } |
| } |
| if (success) { |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| } else { |
| // the character wasn't in the set. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_STAT_SETREF_N: |
| { |
| // Test input character for NOT being a member of one of |
| // the predefined sets (Word Characters, for example) |
| if (fp->fInputIdx >= fActiveLimit) { |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); |
| |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| |
| UChar32 c = UTEXT_NEXT32(fInputText); |
| if (c < 256) { |
| Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; |
| if (s8->contains(c) == FALSE) { |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| break; |
| } |
| } else { |
| const UnicodeSet *s = fPattern->fStaticSets[opValue]; |
| if (s->contains(c) == FALSE) { |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| break; |
| } |
| } |
| // the character wasn't in the set. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| break; |
| |
| |
| case URX_SETREF: |
| if (fp->fInputIdx >= fActiveLimit) { |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } else { |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| |
| // There is input left. Pick up one char and test it for set membership. |
| UChar32 c = UTEXT_NEXT32(fInputText); |
| U_ASSERT(opValue > 0 && opValue < sets->size()); |
| if (c<256) { |
| Regex8BitSet *s8 = &fPattern->fSets8[opValue]; |
| if (s8->contains(c)) { |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| break; |
| } |
| } else { |
| UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); |
| if (s->contains(c)) { |
| // The character is in the set. A Match. |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| break; |
| } |
| } |
| |
| // the character wasn't in the set. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| break; |
| |
| |
| case URX_DOTANY: |
| { |
| // . matches anything, but stops at end-of-line. |
| if (fp->fInputIdx >= fActiveLimit) { |
| // At end of input. Match failed. Backtrack out. |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| |
| // There is input left. Advance over one char, unless we've hit end-of-line |
| UChar32 c = UTEXT_NEXT32(fInputText); |
| if (isLineTerminator(c)) { |
| // End of line in normal mode. . does not match. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| } |
| break; |
| |
| |
| case URX_DOTANY_ALL: |
| { |
| // ., in dot-matches-all (including new lines) mode |
| if (fp->fInputIdx >= fActiveLimit) { |
| // At end of input. Match failed. Backtrack out. |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| |
| // There is input left. Advance over one char, except if we are |
| // at a cr/lf, advance over both of them. |
| UChar32 c; |
| c = UTEXT_NEXT32(fInputText); |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| if (c==0x0d && fp->fInputIdx < fActiveLimit) { |
| // In the case of a CR/LF, we need to advance over both. |
| UChar32 nextc = UTEXT_CURRENT32(fInputText); |
| if (nextc == 0x0a) { |
| (void)UTEXT_NEXT32(fInputText); |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| } |
| } |
| } |
| break; |
| |
| |
| case URX_DOTANY_UNIX: |
| { |
| // '.' operator, matches all, but stops at end-of-line. |
| // UNIX_LINES mode, so 0x0a is the only recognized line ending. |
| if (fp->fInputIdx >= fActiveLimit) { |
| // At end of input. Match failed. Backtrack out. |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| |
| // There is input left. Advance over one char, unless we've hit end-of-line |
| UChar32 c = UTEXT_NEXT32(fInputText); |
| if (c == 0x0a) { |
| // End of line in normal mode. '.' does not match the \n |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } else { |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| } |
| } |
| break; |
| |
| |
| case URX_JMP: |
| fp->fPatIdx = opValue; |
| break; |
| |
| case URX_FAIL: |
| isMatch = FALSE; |
| goto breakFromLoop; |
| |
| case URX_JMP_SAV: |
| U_ASSERT(opValue < fPattern->fCompiledPat->size()); |
| fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current |
| fp->fPatIdx = opValue; // Then JMP. |
| break; |
| |
| case URX_JMP_SAV_X: |
| // This opcode is used with (x)+, when x can match a zero length string. |
| // Same as JMP_SAV, except conditional on the match having made forward progress. |
| // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the |
| // data address of the input position at the start of the loop. |
| { |
| U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); |
| int32_t stoOp = (int32_t)pat[opValue-1]; |
| U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); |
| int32_t frameLoc = URX_VAL(stoOp); |
| U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); |
| int64_t prevInputIdx = fp->fExtra[frameLoc]; |
| U_ASSERT(prevInputIdx <= fp->fInputIdx); |
| if (prevInputIdx < fp->fInputIdx) { |
| // The match did make progress. Repeat the loop. |
| fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current |
| fp->fPatIdx = opValue; |
| fp->fExtra[frameLoc] = fp->fInputIdx; |
| } |
| // If the input position did not advance, we do nothing here, |
| // execution will fall out of the loop. |
| } |
| break; |
| |
| case URX_CTR_INIT: |
| { |
| U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); |
| fp->fExtra[opValue] = 0; // Set the loop counter variable to zero |
| |
| // Pick up the three extra operands that CTR_INIT has, and |
| // skip the pattern location counter past |
| int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
| fp->fPatIdx += 3; |
| int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); |
| int32_t minCount = (int32_t)pat[instrOperandLoc+1]; |
| int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; |
| U_ASSERT(minCount>=0); |
| U_ASSERT(maxCount>=minCount || maxCount==-1); |
| U_ASSERT(loopLoc>=fp->fPatIdx); |
| |
| if (minCount == 0) { |
| fp = StateSave(fp, loopLoc+1, status); |
| } |
| if (maxCount == -1) { |
| fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking. |
| } else if (maxCount == 0) { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| case URX_CTR_LOOP: |
| { |
| U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); |
| int32_t initOp = (int32_t)pat[opValue]; |
| U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); |
| int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; |
| int32_t minCount = (int32_t)pat[opValue+2]; |
| int32_t maxCount = (int32_t)pat[opValue+3]; |
| (*pCounter)++; |
| if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { |
| U_ASSERT(*pCounter == maxCount); |
| break; |
| } |
| if (*pCounter >= minCount) { |
| if (maxCount == -1) { |
| // Loop has no hard upper bound. |
| // Check that it is progressing through the input, break if it is not. |
| int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; |
| if (fp->fInputIdx == *pLastInputIdx) { |
| break; |
| } else { |
| *pLastInputIdx = fp->fInputIdx; |
| } |
| } |
| fp = StateSave(fp, fp->fPatIdx, status); |
| } |
| fp->fPatIdx = opValue + 4; // Loop back. |
| } |
| break; |
| |
| case URX_CTR_INIT_NG: |
| { |
| // Initialize a non-greedy loop |
| U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); |
| fp->fExtra[opValue] = 0; // Set the loop counter variable to zero |
| |
| // Pick up the three extra operands that CTR_INIT_NG has, and |
| // skip the pattern location counter past |
| int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
| fp->fPatIdx += 3; |
| int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); |
| int32_t minCount = (int32_t)pat[instrOperandLoc+1]; |
| int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; |
| U_ASSERT(minCount>=0); |
| U_ASSERT(maxCount>=minCount || maxCount==-1); |
| U_ASSERT(loopLoc>fp->fPatIdx); |
| if (maxCount == -1) { |
| fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking. |
| } |
| |
| if (minCount == 0) { |
| if (maxCount != 0) { |
| fp = StateSave(fp, fp->fPatIdx, status); |
| } |
| fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block |
| } |
| } |
| break; |
| |
| case URX_CTR_LOOP_NG: |
| { |
| // Non-greedy {min, max} loops |
| U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); |
| int32_t initOp = (int32_t)pat[opValue]; |
| U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); |
| int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; |
| int32_t minCount = (int32_t)pat[opValue+2]; |
| int32_t maxCount = (int32_t)pat[opValue+3]; |
| |
| (*pCounter)++; |
| if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { |
| // The loop has matched the maximum permitted number of times. |
| // Break out of here with no action. Matching will |
| // continue with the following pattern. |
| U_ASSERT(*pCounter == maxCount); |
| break; |
| } |
| |
| if (*pCounter < minCount) { |
| // We haven't met the minimum number of matches yet. |
| // Loop back for another one. |
| fp->fPatIdx = opValue + 4; // Loop back. |
| } else { |
| // We do have the minimum number of matches. |
| |
| // If there is no upper bound on the loop iterations, check that the input index |
| // is progressing, and stop the loop if it is not. |
| if (maxCount == -1) { |
| int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; |
| if (fp->fInputIdx == *pLastInputIdx) { |
| break; |
| } |
| *pLastInputIdx = fp->fInputIdx; |
| } |
| |
| // Loop Continuation: we will fall into the pattern following the loop |
| // (non-greedy, don't execute loop body first), but first do |
| // a state save to the top of the loop, so that a match failure |
| // in the following pattern will try another iteration of the loop. |
| fp = StateSave(fp, opValue + 4, status); |
| } |
| } |
| break; |
| |
| case URX_STO_SP: |
| U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); |
| fData[opValue] = fStack->size(); |
| break; |
| |
| case URX_LD_SP: |
| { |
| U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); |
| int32_t newStackSize = (int32_t)fData[opValue]; |
| U_ASSERT(newStackSize <= fStack->size()); |
| int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; |
| if (newFP == (int64_t *)fp) { |
| break; |
| } |
| int32_t i; |
| for (i=0; i<fFrameSize; i++) { |
| newFP[i] = ((int64_t *)fp)[i]; |
| } |
| fp = (REStackFrame *)newFP; |
| fStack->setSize(newStackSize); |
| } |
| break; |
| |
| case URX_BACKREF: |
| { |
| U_ASSERT(opValue < fFrameSize); |
| int64_t groupStartIdx = fp->fExtra[opValue]; |
| int64_t groupEndIdx = fp->fExtra[opValue+1]; |
| U_ASSERT(groupStartIdx <= groupEndIdx); |
| if (groupStartIdx < 0) { |
| // This capture group has not participated in the match thus far, |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. |
| break; |
| } |
| UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| |
| // Note: if the capture group match was of an empty string the backref |
| // match succeeds. Verified by testing: Perl matches succeed |
| // in this case, so we do too. |
| |
| UBool success = TRUE; |
| for (;;) { |
| if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) { |
| success = TRUE; |
| break; |
| } |
| if (utext_getNativeIndex(fInputText) >= fActiveLimit) { |
| success = FALSE; |
| fHitEnd = TRUE; |
| break; |
| } |
| UChar32 captureGroupChar = utext_next32(fAltInputText); |
| UChar32 inputChar = utext_next32(fInputText); |
| if (inputChar != captureGroupChar) { |
| success = FALSE; |
| break; |
| } |
| } |
| |
| if (success) { |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| } else { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| |
| case URX_BACKREF_I: |
| { |
| U_ASSERT(opValue < fFrameSize); |
| int64_t groupStartIdx = fp->fExtra[opValue]; |
| int64_t groupEndIdx = fp->fExtra[opValue+1]; |
| U_ASSERT(groupStartIdx <= groupEndIdx); |
| if (groupStartIdx < 0) { |
| // This capture group has not participated in the match thus far, |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. |
| break; |
| } |
| utext_setNativeIndex(fAltInputText, groupStartIdx); |
| utext_setNativeIndex(fInputText, fp->fInputIdx); |
| CaseFoldingUTextIterator captureGroupItr(*fAltInputText); |
| CaseFoldingUTextIterator inputItr(*fInputText); |
| |
| // Note: if the capture group match was of an empty string the backref |
| // match succeeds. Verified by testing: Perl matches succeed |
| // in this case, so we do too. |
| |
| UBool success = TRUE; |
| for (;;) { |
| if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) { |
| success = TRUE; |
| break; |
| } |
| if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) { |
| success = FALSE; |
| fHitEnd = TRUE; |
| break; |
| } |
| UChar32 captureGroupChar = captureGroupItr.next(); |
| UChar32 inputChar = inputItr.next(); |
| if (inputChar != captureGroupChar) { |
| success = FALSE; |
| break; |
| } |
| } |
| |
| if (success && inputItr.inExpansion()) { |
| // We otained a match by consuming part of a string obtained from |
| // case-folding a single code point of the input text. |
| // This does not count as an overall match. |
| success = FALSE; |
| } |
| |
| if (success) { |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| } else { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| |
| } |
| break; |
| |
| case URX_STO_INP_LOC: |
| { |
| U_ASSERT(opValue >= 0 && opValue < fFrameSize); |
| fp->fExtra[opValue] = fp->fInputIdx; |
| } |
| break; |
| |
| case URX_JMPX: |
| { |
| int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
| fp->fPatIdx += 1; |
| int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); |
| U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); |
| int64_t savedInputIdx = fp->fExtra[dataLoc]; |
| U_ASSERT(savedInputIdx <= fp->fInputIdx); |
| if (savedInputIdx < fp->fInputIdx) { |
| fp->fPatIdx = opValue; // JMP |
| } else { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. |
| } |
| } |
| break; |
| |
| case URX_LA_START: |
| { |
| // Entering a lookahead block. |
| // Save Stack Ptr, Input Pos. |
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| fData[opValue] = fStack->size(); |
| fData[opValue+1] = fp->fInputIdx; |
| fActiveStart = fLookStart; // Set the match region change for |
| fActiveLimit = fLookLimit; // transparent bounds. |
| } |
| break; |
| |
| case URX_LA_END: |
| { |
| // Leaving a look-ahead block. |
| // restore Stack Ptr, Input Pos to positions they had on entry to block. |
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| int32_t stackSize = fStack->size(); |
| int32_t newStackSize =(int32_t)fData[opValue]; |
| U_ASSERT(stackSize >= newStackSize); |
| if (stackSize > newStackSize) { |
| // Copy the current top frame back to the new (cut back) top frame. |
| // This makes the capture groups from within the look-ahead |
| // expression available. |
| int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; |
| int32_t i; |
| for (i=0; i<fFrameSize; i++) { |
| newFP[i] = ((int64_t *)fp)[i]; |
| } |
| fp = (REStackFrame *)newFP; |
| fStack->setSize(newStackSize); |
| } |
| fp->fInputIdx = fData[opValue+1]; |
| |
| // Restore the active region bounds in the input string; they may have |
| // been changed because of transparent bounds on a Region. |
| fActiveStart = fRegionStart; |
| fActiveLimit = fRegionLimit; |
| } |
| break; |
| |
| case URX_ONECHAR_I: |
| // Case insensitive one char. The char from the pattern is already case folded. |
| // Input text is not, but case folding the input can not reduce two or more code |
| // points to one. |
| if (fp->fInputIdx < fActiveLimit) { |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| |
| UChar32 c = UTEXT_NEXT32(fInputText); |
| if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| break; |
| } |
| } else { |
| fHitEnd = TRUE; |
| } |
| |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| |
| case URX_STRING_I: |
| { |
| // Case-insensitive test input against a literal string. |
| // Strings require two slots in the compiled pattern, one for the |
| // offset to the string text, and one for the length. |
| // The compiled string has already been case folded. |
| { |
| const UChar *patternString = litText + opValue; |
| int32_t patternStringIdx = 0; |
| |
| op = (int32_t)pat[fp->fPatIdx]; |
| fp->fPatIdx++; |
| opType = URX_TYPE(op); |
| opValue = URX_VAL(op); |
| U_ASSERT(opType == URX_STRING_LEN); |
| int32_t patternStringLen = opValue; // Length of the string from the pattern. |
| |
| |
| UChar32 cPattern; |
| UChar32 cText; |
| UBool success = TRUE; |
| |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| CaseFoldingUTextIterator inputIterator(*fInputText); |
| while (patternStringIdx < patternStringLen) { |
| if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { |
| success = FALSE; |
| fHitEnd = TRUE; |
| break; |
| } |
| U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern); |
| cText = inputIterator.next(); |
| if (cText != cPattern) { |
| success = FALSE; |
| break; |
| } |
| } |
| if (inputIterator.inExpansion()) { |
| success = FALSE; |
| } |
| |
| if (success) { |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| } else { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| } |
| break; |
| |
| case URX_LB_START: |
| { |
| // Entering a look-behind block. |
| // Save Stack Ptr, Input Pos. |
| // TODO: implement transparent bounds. Ticket #6067 |
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| fData[opValue] = fStack->size(); |
| fData[opValue+1] = fp->fInputIdx; |
| // Init the variable containing the start index for attempted matches. |
| fData[opValue+2] = -1; |
| // Save input string length, then reset to pin any matches to end at |
| // the current position. |
| fData[opValue+3] = fActiveLimit; |
| fActiveLimit = fp->fInputIdx; |
| } |
| break; |
| |
| |
| case URX_LB_CONT: |
| { |
| // Positive Look-Behind, at top of loop checking for matches of LB expression |
| // at all possible input starting positions. |
| |
| // Fetch the min and max possible match lengths. They are the operands |
| // of this op in the pattern. |
| int32_t minML = (int32_t)pat[fp->fPatIdx++]; |
| int32_t maxML = (int32_t)pat[fp->fPatIdx++]; |
| U_ASSERT(minML <= maxML); |
| U_ASSERT(minML >= 0); |
| |
| // Fetch (from data) the last input index where a match was attempted. |
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| int64_t *lbStartIdx = &fData[opValue+2]; |
| if (*lbStartIdx < 0) { |
| // First time through loop. |
| *lbStartIdx = fp->fInputIdx - minML; |
| } else { |
| // 2nd through nth time through the loop. |
| // Back up start position for match by one. |
| if (*lbStartIdx == 0) { |
| (*lbStartIdx)--; |
| } else { |
| UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); |
| (void)UTEXT_PREVIOUS32(fInputText); |
| *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| } |
| } |
| |
| if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { |
| // We have tried all potential match starting points without |
| // getting a match. Backtrack out, and out of the |
| // Look Behind altogether. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| int64_t restoreInputLen = fData[opValue+3]; |
| U_ASSERT(restoreInputLen >= fActiveLimit); |
| U_ASSERT(restoreInputLen <= fInputLength); |
| fActiveLimit = restoreInputLen; |
| break; |
| } |
| |
| // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. |
| // (successful match will fall off the end of the loop.) |
| fp = StateSave(fp, fp->fPatIdx-3, status); |
| fp->fInputIdx = *lbStartIdx; |
| } |
| break; |
| |
| case URX_LB_END: |
| // End of a look-behind block, after a successful match. |
| { |
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| if (fp->fInputIdx != fActiveLimit) { |
| // The look-behind expression matched, but the match did not |
| // extend all the way to the point that we are looking behind from. |
| // FAIL out of here, which will take us back to the LB_CONT, which |
| // will retry the match starting at another position or fail |
| // the look-behind altogether, whichever is appropriate. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| // Look-behind match is good. Restore the orignal input string length, |
| // which had been truncated to pin the end of the lookbehind match to the |
| // position being looked-behind. |
| int64_t originalInputLen = fData[opValue+3]; |
| U_ASSERT(originalInputLen >= fActiveLimit); |
| U_ASSERT(originalInputLen <= fInputLength); |
| fActiveLimit = originalInputLen; |
| } |
| break; |
| |
| |
| case URX_LBN_CONT: |
| { |
| // Negative Look-Behind, at top of loop checking for matches of LB expression |
| // at all possible input starting positions. |
| |
| // Fetch the extra parameters of this op. |
| int32_t minML = (int32_t)pat[fp->fPatIdx++]; |
| int32_t maxML = (int32_t)pat[fp->fPatIdx++]; |
| int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; |
| continueLoc = URX_VAL(continueLoc); |
| U_ASSERT(minML <= maxML); |
| U_ASSERT(minML >= 0); |
| U_ASSERT(continueLoc > fp->fPatIdx); |
| |
| // Fetch (from data) the last input index where a match was attempted. |
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| int64_t *lbStartIdx = &fData[opValue+2]; |
| if (*lbStartIdx < 0) { |
| // First time through loop. |
| *lbStartIdx = fp->fInputIdx - minML; |
| } else { |
| // 2nd through nth time through the loop. |
| // Back up start position for match by one. |
| if (*lbStartIdx == 0) { |
| (*lbStartIdx)--; |
| } else { |
| UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); |
| (void)UTEXT_PREVIOUS32(fInputText); |
| *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| } |
| } |
| |
| if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { |
| // We have tried all potential match starting points without |
| // getting a match, which means that the negative lookbehind as |
| // a whole has succeeded. Jump forward to the continue location |
| int64_t restoreInputLen = fData[opValue+3]; |
| U_ASSERT(restoreInputLen >= fActiveLimit); |
| U_ASSERT(restoreInputLen <= fInputLength); |
| fActiveLimit = restoreInputLen; |
| fp->fPatIdx = continueLoc; |
| break; |
| } |
| |
| // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. |
| // (successful match will cause a FAIL out of the loop altogether.) |
| fp = StateSave(fp, fp->fPatIdx-4, status); |
| fp->fInputIdx = *lbStartIdx; |
| } |
| break; |
| |
| case URX_LBN_END: |
| // End of a negative look-behind block, after a successful match. |
| { |
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| if (fp->fInputIdx != fActiveLimit) { |
| // The look-behind expression matched, but the match did not |
| // extend all the way to the point that we are looking behind from. |
| // FAIL out of here, which will take us back to the LB_CONT, which |
| // will retry the match starting at another position or succeed |
| // the look-behind altogether, whichever is appropriate. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| // Look-behind expression matched, which means look-behind test as |
| // a whole Fails |
| |
| // Restore the orignal input string length, which had been truncated |
| // inorder to pin the end of the lookbehind match |
| // to the position being looked-behind. |
| int64_t originalInputLen = fData[opValue+3]; |
| U_ASSERT(originalInputLen >= fActiveLimit); |
| U_ASSERT(originalInputLen <= fInputLength); |
| fActiveLimit = originalInputLen; |
| |
| // Restore original stack position, discarding any state saved |
| // by the successful pattern match. |
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| int32_t newStackSize = (int32_t)fData[opValue]; |
| U_ASSERT(fStack->size() > newStackSize); |
| fStack->setSize(newStackSize); |
| |
| // FAIL, which will take control back to someplace |
| // prior to entering the look-behind test. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| break; |
| |
| |
| case URX_LOOP_SR_I: |
| // Loop Initialization for the optimized implementation of |
| // [some character set]* |
| // This op scans through all matching input. |
| // The following LOOP_C op emulates stack unwinding if the following pattern fails. |
| { |
| U_ASSERT(opValue > 0 && opValue < sets->size()); |
| Regex8BitSet *s8 = &fPattern->fSets8[opValue]; |
| UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); |
| |
| // Loop through input, until either the input is exhausted or |
| // we reach a character that is not a member of the set. |
| int64_t ix = fp->fInputIdx; |
| UTEXT_SETNATIVEINDEX(fInputText, ix); |
| for (;;) { |
| if (ix >= fActiveLimit) { |
| fHitEnd = TRUE; |
| break; |
| } |
| UChar32 c = UTEXT_NEXT32(fInputText); |
| if (c<256) { |
| if (s8->contains(c) == FALSE) { |
| break; |
| } |
| } else { |
| if (s->contains(c) == FALSE) { |
| break; |
| } |
| } |
| ix = UTEXT_GETNATIVEINDEX(fInputText); |
| } |
| |
| // If there were no matching characters, skip over the loop altogether. |
| // The loop doesn't run at all, a * op always succeeds. |
| if (ix == fp->fInputIdx) { |
| fp->fPatIdx++; // skip the URX_LOOP_C op. |
| break; |
| } |
| |
| // Peek ahead in the compiled pattern, to the URX_LOOP_C that |
| // must follow. It's operand is the stack location |
| // that holds the starting input index for the match of this [set]* |
| int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; |
| U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); |
| int32_t stackLoc = URX_VAL(loopcOp); |
| U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); |
| fp->fExtra[stackLoc] = fp->fInputIdx; |
| fp->fInputIdx = ix; |
| |
| // Save State to the URX_LOOP_C op that follows this one, |
| // so that match failures in the following code will return to there. |
| // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. |
| fp = StateSave(fp, fp->fPatIdx, status); |
| fp->fPatIdx++; |
| } |
| break; |
| |
| |
| case URX_LOOP_DOT_I: |
| // Loop Initialization for the optimized implementation of .* |
| // This op scans through all remaining input. |
| // The following LOOP_C op emulates stack unwinding if the following pattern fails. |
| { |
| // Loop through input until the input is exhausted (we reach an end-of-line) |
| // In DOTALL mode, we can just go straight to the end of the input. |
| int64_t ix; |
| if ((opValue & 1) == 1) { |
| // Dot-matches-All mode. Jump straight to the end of the string. |
| ix = fActiveLimit; |
| fHitEnd = TRUE; |
| } else { |
| // NOT DOT ALL mode. Line endings do not match '.' |
| // Scan forward until a line ending or end of input. |
| ix = fp->fInputIdx; |
| UTEXT_SETNATIVEINDEX(fInputText, ix); |
| for (;;) { |
| if (ix >= fActiveLimit) { |
| fHitEnd = TRUE; |
| break; |
| } |
| UChar32 c = UTEXT_NEXT32(fInputText); |
| if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s |
| if ((c == 0x0a) || // 0x0a is newline in both modes. |
| (((opValue & 2) == 0) && // IF not UNIX_LINES mode |
| isLineTerminator(c))) { |
| // char is a line ending. Exit the scanning loop. |
| break; |
| } |
| } |
| ix = UTEXT_GETNATIVEINDEX(fInputText); |
| } |
| } |
| |
| // If there were no matching characters, skip over the loop altogether. |
| // The loop doesn't run at all, a * op always succeeds. |
| if (ix == fp->fInputIdx) { |
| fp->fPatIdx++; // skip the URX_LOOP_C op. |
| break; |
| } |
| |
| // Peek ahead in the compiled pattern, to the URX_LOOP_C that |
| // must follow. It's operand is the stack location |
| // that holds the starting input index for the match of this .* |
| int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; |
| U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); |
| int32_t stackLoc = URX_VAL(loopcOp); |
| U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); |
| fp->fExtra[stackLoc] = fp->fInputIdx; |
| fp->fInputIdx = ix; |
| |
| // Save State to the URX_LOOP_C op that follows this one, |
| // so that match failures in the following code will return to there. |
| // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. |
| fp = StateSave(fp, fp->fPatIdx, status); |
| fp->fPatIdx++; |
| } |
| break; |
| |
| |
| case URX_LOOP_C: |
| { |
| U_ASSERT(opValue>=0 && opValue<fFrameSize); |
| backSearchIndex = fp->fExtra[opValue]; |
| U_ASSERT(backSearchIndex <= fp->fInputIdx); |
| if (backSearchIndex == fp->fInputIdx) { |
| // We've backed up the input idx to the point that the loop started. |
| // The loop is done. Leave here without saving state. |
| // Subsequent failures won't come back here. |
| break; |
| } |
| // Set up for the next iteration of the loop, with input index |
| // backed up by one from the last time through, |
| // and a state save to this instruction in case the following code fails again. |
| // (We're going backwards because this loop emulates stack unwinding, not |
| // the initial scan forward.) |
| U_ASSERT(fp->fInputIdx > 0); |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| UChar32 prevC = UTEXT_PREVIOUS32(fInputText); |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| |
| UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); |
| if (prevC == 0x0a && |
| fp->fInputIdx > backSearchIndex && |
| twoPrevC == 0x0d) { |
| int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; |
| if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { |
| // .*, stepping back over CRLF pair. |
| fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); |
| } |
| } |
| |
| |
| fp = StateSave(fp, fp->fPatIdx-1, status); |
| } |
| break; |
| |
| |
| |
| default: |
| // Trouble. The compiled pattern contains an entry with an |
| // unrecognized type tag. |
| U_ASSERT(FALSE); |
| } |
| |
| if (U_FAILURE(status)) { |
| isMatch = FALSE; |
| break; |
| } |
| } |
| |
| breakFromLoop: |
| fMatch = isMatch; |
| if (isMatch) { |
| fLastMatchEnd = fMatchEnd; |
| fMatchStart = startIdx; |
| fMatchEnd = fp->fInputIdx; |
| } |
| |
| #ifdef REGEX_RUN_DEBUG |
| if (fTraceDebug) { |
| if (isMatch) { |
| printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd); |
| } else { |
| printf("No match\n\n"); |
| } |
| } |
| #endif |
| |
| fFrame = fp; // The active stack frame when the engine stopped. |
| // Contains the capture group results that we need to |
| // access later. |
| return; |
| } |
| |
| |
| //-------------------------------------------------------------------------------- |
| // |
| // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the |
| // assumption that the entire string is available in the UText's |
| // chunk buffer. For now, that means we can use int32_t indexes, |
| // except for anything that needs to be saved (like group starts |
| // and ends). |
| // |
| // startIdx: begin matching a this index. |
| // toEnd: if true, match must extend to end of the input region |
| // |
| //-------------------------------------------------------------------------------- |
| void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { |
| UBool isMatch = FALSE; // True if the we have a match. |
| |
| int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards |
| |
| int32_t op; // Operation from the compiled pattern, split into |
| int32_t opType; // the opcode |
| int32_t opValue; // and the operand value. |
| |
| #ifdef REGEX_RUN_DEBUG |
| if (fTraceDebug) { |
| printf("MatchAt(startIdx=%d)\n", startIdx); |
| printf("Original Pattern: "); |
| UChar32 c = utext_next32From(fPattern->fPattern, 0); |
| while (c != U_SENTINEL) { |
| if (c<32 || c>256) { |
| c = '.'; |
| } |
| printf("%c", c); |
| |
| c = UTEXT_NEXT32(fPattern->fPattern); |
| } |
| printf("\n"); |
| printf("Input String: "); |
| c = utext_next32From(fInputText, 0); |
| while (c != U_SENTINEL) { |
| if (c<32 || c>256) { |
| c = '.'; |
| } |
| printf("%c", c); |
| |
| c = UTEXT_NEXT32(fInputText); |
| } |
| printf("\n"); |
| printf("\n"); |
| } |
| #endif |
| |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| // Cache frequently referenced items from the compiled pattern |
| // |
| int64_t *pat = fPattern->fCompiledPat->getBuffer(); |
| |
| const UChar *litText = fPattern->fLiteralText.getBuffer(); |
| UVector *sets = fPattern->fSets; |
| |
| const UChar *inputBuf = fInputText->chunkContents; |
| |
| fFrameSize = fPattern->fFrameSize; |
| REStackFrame *fp = resetStack(); |
| if (U_FAILURE(fDeferredStatus)) { |
| status = fDeferredStatus; |
| return; |
| } |
| |
| fp->fPatIdx = 0; |
| fp->fInputIdx = startIdx; |
| |
| // Zero out the pattern's static data |
| int32_t i; |
| for (i = 0; i<fPattern->fDataSize; i++) { |
| fData[i] = 0; |
| } |
| |
| // |
| // Main loop for interpreting the compiled pattern. |
| // One iteration of the loop per pattern operation performed. |
| // |
| for (;;) { |
| op = (int32_t)pat[fp->fPatIdx]; |
| opType = URX_TYPE(op); |
| opValue = URX_VAL(op); |
| #ifdef REGEX_RUN_DEBUG |
| if (fTraceDebug) { |
| UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); |
| printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, |
| UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); |
| fPattern->dumpOp(fp->fPatIdx); |
| } |
| #endif |
| fp->fPatIdx++; |
| |
| switch (opType) { |
| |
| |
| case URX_NOP: |
| break; |
| |
| |
| case URX_BACKTRACK: |
| // Force a backtrack. In some circumstances, the pattern compiler |
| // will notice that the pattern can't possibly match anything, and will |
| // emit one of these at that point. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| |
| |
| case URX_ONECHAR: |
| if (fp->fInputIdx < fActiveLimit) { |
| UChar32 c; |
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| if (c == opValue) { |
| break; |
| } |
| } else { |
| fHitEnd = TRUE; |
| } |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| |
| |
| case URX_STRING: |
| { |
| // Test input against a literal string. |
| // Strings require two slots in the compiled pattern, one for the |
| // offset to the string text, and one for the length. |
| int32_t stringStartIdx = opValue; |
| int32_t stringLen; |
| |
| op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand |
| fp->fPatIdx++; |
| opType = URX_TYPE(op); |
| stringLen = URX_VAL(op); |
| U_ASSERT(opType == URX_STRING_LEN); |
| U_ASSERT(stringLen >= 2); |
| |
| const UChar * pInp = inputBuf + fp->fInputIdx; |
| const UChar * pInpLimit = inputBuf + fActiveLimit; |
| const UChar * pPat = litText+stringStartIdx; |
| const UChar * pEnd = pInp + stringLen; |
| UBool success = TRUE; |
| while (pInp < pEnd) { |
| if (pInp >= pInpLimit) { |
| fHitEnd = TRUE; |
| success = FALSE; |
| break; |
| } |
| if (*pInp++ != *pPat++) { |
| success = FALSE; |
| break; |
| } |
| } |
| |
| if (success) { |
| fp->fInputIdx += stringLen; |
| } else { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_STATE_SAVE: |
| fp = StateSave(fp, opValue, status); |
| break; |
| |
| |
| case URX_END: |
| // The match loop will exit via this path on a successful match, |
| // when we reach the end of the pattern. |
| if (toEnd && fp->fInputIdx != fActiveLimit) { |
| // The pattern matched, but not to the end of input. Try some more. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| isMatch = TRUE; |
| goto breakFromLoop; |
| |
| // Start and End Capture stack frame variables are laid out out like this: |
| // fp->fExtra[opValue] - The start of a completed capture group |
| // opValue+1 - The end of a completed capture group |
| // opValue+2 - the start of a capture group whose end |
| // has not yet been reached (and might not ever be). |
| case URX_START_CAPTURE: |
| U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); |
| fp->fExtra[opValue+2] = fp->fInputIdx; |
| break; |
| |
| |
| case URX_END_CAPTURE: |
| U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); |
| U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. |
| fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. |
| fp->fExtra[opValue+1] = fp->fInputIdx; // End position |
| U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); |
| break; |
| |
| |
| case URX_DOLLAR: // $, test for End of line |
| // or for position before new line at end of input |
| if (fp->fInputIdx < fAnchorLimit-2) { |
| // We are no where near the end of input. Fail. |
| // This is the common case. Keep it first. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| if (fp->fInputIdx >= fAnchorLimit) { |
| // We really are at the end of input. Success. |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| break; |
| } |
| |
| // If we are positioned just before a new-line that is located at the |
| // end of input, succeed. |
| if (fp->fInputIdx == fAnchorLimit-1) { |
| UChar32 c; |
| U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); |
| |
| if (isLineTerminator(c)) { |
| if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { |
| // At new-line at end of input. Success |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| break; |
| } |
| } |
| } else if (fp->fInputIdx == fAnchorLimit-2 && |
| inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) { |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| break; // At CR/LF at end of input. Success |
| } |
| |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| |
| break; |
| |
| |
| case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. |
| if (fp->fInputIdx >= fAnchorLimit-1) { |
| // Either at the last character of input, or off the end. |
| if (fp->fInputIdx == fAnchorLimit-1) { |
| // At last char of input. Success if it's a new line. |
| if (inputBuf[fp->fInputIdx] == 0x0a) { |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| break; |
| } |
| } else { |
| // Off the end of input. Success. |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| break; |
| } |
| } |
| |
| // Not at end of input. Back-track out. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| |
| |
| case URX_DOLLAR_M: // $, test for End of line in multi-line mode |
| { |
| if (fp->fInputIdx >= fAnchorLimit) { |
| // We really are at the end of input. Success. |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| break; |
| } |
| // If we are positioned just before a new-line, succeed. |
| // It makes no difference where the new-line is within the input. |
| UChar32 c = inputBuf[fp->fInputIdx]; |
| if (isLineTerminator(c)) { |
| // At a line end, except for the odd chance of being in the middle of a CR/LF sequence |
| // In multi-line mode, hitting a new-line just before the end of input does not |
| // set the hitEnd or requireEnd flags |
| if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { |
| break; |
| } |
| } |
| // not at a new line. Fail. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| break; |
| |
| |
| case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode |
| { |
| if (fp->fInputIdx >= fAnchorLimit) { |
| // We really are at the end of input. Success. |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; // Java set requireEnd in this case, even though |
| break; // adding a new-line would not lose the match. |
| } |
| // If we are not positioned just before a new-line, the test fails; backtrack out. |
| // It makes no difference where the new-line is within the input. |
| if (inputBuf[fp->fInputIdx] != 0x0a) { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_CARET: // ^, test for start of line |
| if (fp->fInputIdx != fAnchorStart) { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| break; |
| |
| |
| case URX_CARET_M: // ^, test for start of line in mulit-line mode |
| { |
| if (fp->fInputIdx == fAnchorStart) { |
| // We are at the start input. Success. |
| break; |
| } |
| // Check whether character just before the current pos is a new-line |
| // unless we are at the end of input |
| UChar c = inputBuf[fp->fInputIdx - 1]; |
| if ((fp->fInputIdx < fAnchorLimit) && |
| isLineTerminator(c)) { |
| // It's a new-line. ^ is true. Success. |
| // TODO: what should be done with positions between a CR and LF? |
| break; |
| } |
| // Not at the start of a line. Fail. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| break; |
| |
| |
| case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode |
| { |
| U_ASSERT(fp->fInputIdx >= fAnchorStart); |
| if (fp->fInputIdx <= fAnchorStart) { |
| // We are at the start input. Success. |
| break; |
| } |
| // Check whether character just before the current pos is a new-line |
| U_ASSERT(fp->fInputIdx <= fAnchorLimit); |
| UChar c = inputBuf[fp->fInputIdx - 1]; |
| if (c != 0x0a) { |
| // Not at the start of a line. Back-track out. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| case URX_BACKSLASH_B: // Test for word boundaries |
| { |
| UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); |
| success ^= (UBool)(opValue != 0); // flip sense for \B |
| if (!success) { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style |
| { |
| UBool success = isUWordBoundary(fp->fInputIdx); |
| success ^= (UBool)(opValue != 0); // flip sense for \B |
| if (!success) { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_BACKSLASH_D: // Test for decimal digit |
| { |
| if (fp->fInputIdx >= fActiveLimit) { |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| UChar32 c; |
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. |
| UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); |
| success ^= (UBool)(opValue != 0); // flip sense for \D |
| if (!success) { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_BACKSLASH_G: // Test for position at end of previous match |
| if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| break; |
| |
| |
| case URX_BACKSLASH_H: // Test for \h, horizontal white space. |
| { |
| if (fp->fInputIdx >= fActiveLimit) { |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| UChar32 c; |
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| int8_t ctype = u_charType(c); |
| UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB |
| success ^= (UBool)(opValue != 0); // flip sense for \H |
| if (!success) { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_BACKSLASH_R: // Test for \R, any line break sequence. |
| { |
| if (fp->fInputIdx >= fActiveLimit) { |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| UChar32 c; |
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| if (isLineTerminator(c)) { |
| if (c == 0x0d && fp->fInputIdx < fActiveLimit) { |
| // Check for CR/LF sequence. Consume both together when found. |
| UChar c2; |
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2); |
| if (c2 != 0x0a) { |
| U16_PREV(inputBuf, 0, fp->fInputIdx, c2); |
| } |
| } |
| } else { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_BACKSLASH_V: // Any single code point line ending. |
| { |
| if (fp->fInputIdx >= fActiveLimit) { |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| UChar32 c; |
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| UBool success = isLineTerminator(c); |
| success ^= (UBool)(opValue != 0); // flip sense for \V |
| if (!success) { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| |
| case URX_BACKSLASH_X: |
| // Match a Grapheme, as defined by Unicode TR 29. |
| // Differs slightly from Perl, which consumes combining marks independently |
| // of context. |
| { |
| |
| // Fail if at end of input |
| if (fp->fInputIdx >= fActiveLimit) { |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| // Examine (and consume) the current char. |
| // Dispatch into a little state machine, based on the char. |
| UChar32 c; |
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| UnicodeSet **sets = fPattern->fStaticSets; |
| if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; |
| if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; |
| if (sets[URX_GC_L]->contains(c)) goto GC_L; |
| if (sets[URX_GC_LV]->contains(c)) goto GC_V; |
| if (sets[URX_GC_LVT]->contains(c)) goto GC_T; |
| if (sets[URX_GC_V]->contains(c)) goto GC_V; |
| if (sets[URX_GC_T]->contains(c)) goto GC_T; |
| goto GC_Extend; |
| |
| |
| |
| GC_L: |
| if (fp->fInputIdx >= fActiveLimit) goto GC_Done; |
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| if (sets[URX_GC_L]->contains(c)) goto GC_L; |
| if (sets[URX_GC_LV]->contains(c)) goto GC_V; |
| if (sets[URX_GC_LVT]->contains(c)) goto GC_T; |
| if (sets[URX_GC_V]->contains(c)) goto GC_V; |
| U16_PREV(inputBuf, 0, fp->fInputIdx, c); |
| goto GC_Extend; |
| |
| GC_V: |
| if (fp->fInputIdx >= fActiveLimit) goto GC_Done; |
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| if (sets[URX_GC_V]->contains(c)) goto GC_V; |
| if (sets[URX_GC_T]->contains(c)) goto GC_T; |
| U16_PREV(inputBuf, 0, fp->fInputIdx, c); |
| goto GC_Extend; |
| |
| GC_T: |
| if (fp->fInputIdx >= fActiveLimit) goto GC_Done; |
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| if (sets[URX_GC_T]->contains(c)) goto GC_T; |
| U16_PREV(inputBuf, 0, fp->fInputIdx, c); |
| goto GC_Extend; |
| |
| GC_Extend: |
| // Combining characters are consumed here |
| for (;;) { |
| if (fp->fInputIdx >= fActiveLimit) { |
| break; |
| } |
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { |
| U16_BACK_1(inputBuf, 0, fp->fInputIdx); |
| break; |
| } |
| } |
| goto GC_Done; |
| |
| GC_Control: |
| // Most control chars stand alone (don't combine with combining chars), |
| // except for that CR/LF sequence is a single grapheme cluster. |
| if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) { |
| fp->fInputIdx++; |
| } |
| |
| GC_Done: |
| if (fp->fInputIdx >= fActiveLimit) { |
| fHitEnd = TRUE; |
| } |
| break; |
| } |
| |
| |
| |
| |
| case URX_BACKSLASH_Z: // Test for end of Input |
| if (fp->fInputIdx < fAnchorLimit) { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } else { |
| fHitEnd = TRUE; |
| fRequireEnd = TRUE; |
| } |
| break; |
| |
| |
| |
| case URX_STATIC_SETREF: |
| { |
| // Test input character against one of the predefined sets |
| // (Word Characters, for example) |
| // The high bit of the op value is a flag for the match polarity. |
| // 0: success if input char is in set. |
| // 1: success if input char is not in set. |
| if (fp->fInputIdx >= fActiveLimit) { |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); |
| opValue &= ~URX_NEG_SET; |
| U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); |
| |
| UChar32 c; |
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| if (c < 256) { |
| Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; |
| if (s8->contains(c)) { |
| success = !success; |
| } |
| } else { |
| const UnicodeSet *s = fPattern->fStaticSets[opValue]; |
| if (s->contains(c)) { |
| success = !success; |
| } |
| } |
| if (!success) { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_STAT_SETREF_N: |
| { |
| // Test input character for NOT being a member of one of |
| // the predefined sets (Word Characters, for example) |
| if (fp->fInputIdx >= fActiveLimit) { |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); |
| |
| UChar32 c; |
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| if (c < 256) { |
| Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; |
| if (s8->contains(c) == FALSE) { |
| break; |
| } |
| } else { |
| const UnicodeSet *s = fPattern->fStaticSets[opValue]; |
| if (s->contains(c) == FALSE) { |
| break; |
| } |
| } |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| break; |
| |
| |
| case URX_SETREF: |
| { |
| if (fp->fInputIdx >= fActiveLimit) { |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| U_ASSERT(opValue > 0 && opValue < sets->size()); |
| |
| // There is input left. Pick up one char and test it for set membership. |
| UChar32 c; |
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| if (c<256) { |
| Regex8BitSet *s8 = &fPattern->fSets8[opValue]; |
| if (s8->contains(c)) { |
| // The character is in the set. A Match. |
| break; |
| } |
| } else { |
| UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); |
| if (s->contains(c)) { |
| // The character is in the set. A Match. |
| break; |
| } |
| } |
| |
| // the character wasn't in the set. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| break; |
| |
| |
| case URX_DOTANY: |
| { |
| // . matches anything, but stops at end-of-line. |
| if (fp->fInputIdx >= fActiveLimit) { |
| // At end of input. Match failed. Backtrack out. |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| // There is input left. Advance over one char, unless we've hit end-of-line |
| UChar32 c; |
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| if (isLineTerminator(c)) { |
| // End of line in normal mode. . does not match. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| } |
| break; |
| |
| |
| case URX_DOTANY_ALL: |
| { |
| // . in dot-matches-all (including new lines) mode |
| if (fp->fInputIdx >= fActiveLimit) { |
| // At end of input. Match failed. Backtrack out. |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| // There is input left. Advance over one char, except if we are |
| // at a cr/lf, advance over both of them. |
| UChar32 c; |
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| if (c==0x0d && fp->fInputIdx < fActiveLimit) { |
| // In the case of a CR/LF, we need to advance over both. |
| if (inputBuf[fp->fInputIdx] == 0x0a) { |
| U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); |
| } |
| } |
| } |
| break; |
| |
| |
| case URX_DOTANY_UNIX: |
| { |
| // '.' operator, matches all, but stops at end-of-line. |
| // UNIX_LINES mode, so 0x0a is the only recognized line ending. |
| if (fp->fInputIdx >= fActiveLimit) { |
| // At end of input. Match failed. Backtrack out. |
| fHitEnd = TRUE; |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| // There is input left. Advance over one char, unless we've hit end-of-line |
| UChar32 c; |
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| if (c == 0x0a) { |
| // End of line in normal mode. '.' does not match the \n |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| |
| case URX_JMP: |
| fp->fPatIdx = opValue; |
| break; |
| |
| case URX_FAIL: |
| isMatch = FALSE; |
| goto breakFromLoop; |
| |
| case URX_JMP_SAV: |
| U_ASSERT(opValue < fPattern->fCompiledPat->size()); |
| fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current |
| fp->fPatIdx = opValue; // Then JMP. |
| break; |
| |
| case URX_JMP_SAV_X: |
| // This opcode is used with (x)+, when x can match a zero length string. |
| // Same as JMP_SAV, except conditional on the match having made forward progress. |
| // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the |
| // data address of the input position at the start of the loop. |
| { |
| U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); |
| int32_t stoOp = (int32_t)pat[opValue-1]; |
| U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); |
| int32_t frameLoc = URX_VAL(stoOp); |
| U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); |
| int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; |
| U_ASSERT(prevInputIdx <= fp->fInputIdx); |
| if (prevInputIdx < fp->fInputIdx) { |
| // The match did make progress. Repeat the loop. |
| fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current |
| fp->fPatIdx = opValue; |
| fp->fExtra[frameLoc] = fp->fInputIdx; |
| } |
| // If the input position did not advance, we do nothing here, |
| // execution will fall out of the loop. |
| } |
| break; |
| |
| case URX_CTR_INIT: |
| { |
| U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); |
| fp->fExtra[opValue] = 0; // Set the loop counter variable to zero |
| |
| // Pick up the three extra operands that CTR_INIT has, and |
| // skip the pattern location counter past |
| int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
| fp->fPatIdx += 3; |
| int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); |
| int32_t minCount = (int32_t)pat[instrOperandLoc+1]; |
| int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; |
| U_ASSERT(minCount>=0); |
| U_ASSERT(maxCount>=minCount || maxCount==-1); |
| U_ASSERT(loopLoc>=fp->fPatIdx); |
| |
| if (minCount == 0) { |
| fp = StateSave(fp, loopLoc+1, status); |
| } |
| if (maxCount == -1) { |
| fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking. |
| } else if (maxCount == 0) { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| case URX_CTR_LOOP: |
| { |
| U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); |
| int32_t initOp = (int32_t)pat[opValue]; |
| U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); |
| int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; |
| int32_t minCount = (int32_t)pat[opValue+2]; |
| int32_t maxCount = (int32_t)pat[opValue+3]; |
| (*pCounter)++; |
| if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { |
| U_ASSERT(*pCounter == maxCount); |
| break; |
| } |
| if (*pCounter >= minCount) { |
| if (maxCount == -1) { |
| // Loop has no hard upper bound. |
| // Check that it is progressing through the input, break if it is not. |
| int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; |
| if (fp->fInputIdx == *pLastInputIdx) { |
| break; |
| } else { |
| *pLastInputIdx = fp->fInputIdx; |
| } |
| } |
| fp = StateSave(fp, fp->fPatIdx, status); |
| } |
| fp->fPatIdx = opValue + 4; // Loop back. |
| } |
| break; |
| |
| case URX_CTR_INIT_NG: |
| { |
| // Initialize a non-greedy loop |
| U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); |
| fp->fExtra[opValue] = 0; // Set the loop counter variable to zero |
| |
| // Pick up the three extra operands that CTR_INIT_NG has, and |
| // skip the pattern location counter past |
| int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
| fp->fPatIdx += 3; |
| int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); |
| int32_t minCount = (int32_t)pat[instrOperandLoc+1]; |
| int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; |
| U_ASSERT(minCount>=0); |
| U_ASSERT(maxCount>=minCount || maxCount==-1); |
| U_ASSERT(loopLoc>fp->fPatIdx); |
| if (maxCount == -1) { |
| fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking. |
| } |
| |
| if (minCount == 0) { |
| if (maxCount != 0) { |
| fp = StateSave(fp, fp->fPatIdx, status); |
| } |
| fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block |
| } |
| } |
| break; |
| |
| case URX_CTR_LOOP_NG: |
| { |
| // Non-greedy {min, max} loops |
| U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); |
| int32_t initOp = (int32_t)pat[opValue]; |
| U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); |
| int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; |
| int32_t minCount = (int32_t)pat[opValue+2]; |
| int32_t maxCount = (int32_t)pat[opValue+3]; |
| |
| (*pCounter)++; |
| if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { |
| // The loop has matched the maximum permitted number of times. |
| // Break out of here with no action. Matching will |
| // continue with the following pattern. |
| U_ASSERT(*pCounter == maxCount); |
| break; |
| } |
| |
| if (*pCounter < minCount) { |
| // We haven't met the minimum number of matches yet. |
| // Loop back for another one. |
| fp->fPatIdx = opValue + 4; // Loop back. |
| } else { |
| // We do have the minimum number of matches. |
| |
| // If there is no upper bound on the loop iterations, check that the input index |
| // is progressing, and stop the loop if it is not. |
| if (maxCount == -1) { |
| int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; |
| if (fp->fInputIdx == *pLastInputIdx) { |
| break; |
| } |
| *pLastInputIdx = fp->fInputIdx; |
| } |
| |
| // Loop Continuation: we will fall into the pattern following the loop |
| // (non-greedy, don't execute loop body first), but first do |
| // a state save to the top of the loop, so that a match failure |
| // in the following pattern will try another iteration of the loop. |
| fp = StateSave(fp, opValue + 4, status); |
| } |
| } |
| break; |
| |
| case URX_STO_SP: |
| U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); |
| fData[opValue] = fStack->size(); |
| break; |
| |
| case URX_LD_SP: |
| { |
| U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); |
| int32_t newStackSize = (int32_t)fData[opValue]; |
| U_ASSERT(newStackSize <= fStack->size()); |
| int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; |
| if (newFP == (int64_t *)fp) { |
| break; |
| } |
| int32_t i; |
| for (i=0; i<fFrameSize; i++) { |
| newFP[i] = ((int64_t *)fp)[i]; |
| } |
| fp = (REStackFrame *)newFP; |
| fStack->setSize(newStackSize); |
| } |
| break; |
| |
| case URX_BACKREF: |
| { |
| U_ASSERT(opValue < fFrameSize); |
| int64_t groupStartIdx = fp->fExtra[opValue]; |
| int64_t groupEndIdx = fp->fExtra[opValue+1]; |
| U_ASSERT(groupStartIdx <= groupEndIdx); |
| int64_t inputIndex = fp->fInputIdx; |
| if (groupStartIdx < 0) { |
| // This capture group has not participated in the match thus far, |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. |
| break; |
| } |
| UBool success = TRUE; |
| for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) { |
| if (inputIndex >= fActiveLimit) { |
| success = FALSE; |
| fHitEnd = TRUE; |
| break; |
| } |
| if (inputBuf[groupIndex] != inputBuf[inputIndex]) { |
| success = FALSE; |
| break; |
| } |
| } |
| if (success) { |
| fp->fInputIdx = inputIndex; |
| } else { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| case URX_BACKREF_I: |
| { |
| U_ASSERT(opValue < fFrameSize); |
| int64_t groupStartIdx = fp->fExtra[opValue]; |
| int64_t groupEndIdx = fp->fExtra[opValue+1]; |
| U_ASSERT(groupStartIdx <= groupEndIdx); |
| if (groupStartIdx < 0) { |
| // This capture group has not participated in the match thus far, |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. |
| break; |
| } |
| CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx); |
| CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit); |
| |
| // Note: if the capture group match was of an empty string the backref |
| // match succeeds. Verified by testing: Perl matches succeed |
| // in this case, so we do too. |
| |
| UBool success = TRUE; |
| for (;;) { |
| UChar32 captureGroupChar = captureGroupItr.next(); |
| if (captureGroupChar == U_SENTINEL) { |
| success = TRUE; |
| break; |
| } |
| UChar32 inputChar = inputItr.next(); |
| if (inputChar == U_SENTINEL) { |
| success = FALSE; |
| fHitEnd = TRUE; |
| break; |
| } |
| if (inputChar != captureGroupChar) { |
| success = FALSE; |
| break; |
| } |
| } |
| |
| if (success && inputItr.inExpansion()) { |
| // We otained a match by consuming part of a string obtained from |
| // case-folding a single code point of the input text. |
| // This does not count as an overall match. |
| success = FALSE; |
| } |
| |
| if (success) { |
| fp->fInputIdx = inputItr.getIndex(); |
| } else { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| case URX_STO_INP_LOC: |
| { |
| U_ASSERT(opValue >= 0 && opValue < fFrameSize); |
| fp->fExtra[opValue] = fp->fInputIdx; |
| } |
| break; |
| |
| case URX_JMPX: |
| { |
| int32_t instrOperandLoc = (int32_t)fp->fPatIdx; |
| fp->fPatIdx += 1; |
| int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); |
| U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); |
| int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; |
| U_ASSERT(savedInputIdx <= fp->fInputIdx); |
| if (savedInputIdx < fp->fInputIdx) { |
| fp->fPatIdx = opValue; // JMP |
| } else { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. |
| } |
| } |
| break; |
| |
| case URX_LA_START: |
| { |
| // Entering a lookahead block. |
| // Save Stack Ptr, Input Pos. |
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| fData[opValue] = fStack->size(); |
| fData[opValue+1] = fp->fInputIdx; |
| fActiveStart = fLookStart; // Set the match region change for |
| fActiveLimit = fLookLimit; // transparent bounds. |
| } |
| break; |
| |
| case URX_LA_END: |
| { |
| // Leaving a look-ahead block. |
| // restore Stack Ptr, Input Pos to positions they had on entry to block. |
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| int32_t stackSize = fStack->size(); |
| int32_t newStackSize = (int32_t)fData[opValue]; |
| U_ASSERT(stackSize >= newStackSize); |
| if (stackSize > newStackSize) { |
| // Copy the current top frame back to the new (cut back) top frame. |
| // This makes the capture groups from within the look-ahead |
| // expression available. |
| int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; |
| int32_t i; |
| for (i=0; i<fFrameSize; i++) { |
| newFP[i] = ((int64_t *)fp)[i]; |
| } |
| fp = (REStackFrame *)newFP; |
| fStack->setSize(newStackSize); |
| } |
| fp->fInputIdx = fData[opValue+1]; |
| |
| // Restore the active region bounds in the input string; they may have |
| // been changed because of transparent bounds on a Region. |
| fActiveStart = fRegionStart; |
| fActiveLimit = fRegionLimit; |
| } |
| break; |
| |
| case URX_ONECHAR_I: |
| if (fp->fInputIdx < fActiveLimit) { |
| UChar32 c; |
| U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); |
| if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { |
| break; |
| } |
| } else { |
| fHitEnd = TRUE; |
| } |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| |
| case URX_STRING_I: |
| // Case-insensitive test input against a literal string. |
| // Strings require two slots in the compiled pattern, one for the |
| // offset to the string text, and one for the length. |
| // The compiled string has already been case folded. |
| { |
| const UChar *patternString = litText + opValue; |
| |
| op = (int32_t)pat[fp->fPatIdx]; |
| fp->fPatIdx++; |
| opType = URX_TYPE(op); |
| opValue = URX_VAL(op); |
| U_ASSERT(opType == URX_STRING_LEN); |
| int32_t patternStringLen = opValue; // Length of the string from the pattern. |
| |
| UChar32 cText; |
| UChar32 cPattern; |
| UBool success = TRUE; |
| int32_t patternStringIdx = 0; |
| CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit); |
| while (patternStringIdx < patternStringLen) { |
| U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern); |
| cText = inputIterator.next(); |
| if (cText != cPattern) { |
| success = FALSE; |
| if (cText == U_SENTINEL) { |
| fHitEnd = TRUE; |
| } |
| break; |
| } |
| } |
| if (inputIterator.inExpansion()) { |
| success = FALSE; |
| } |
| |
| if (success) { |
| fp->fInputIdx = inputIterator.getIndex(); |
| } else { |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| } |
| break; |
| |
| case URX_LB_START: |
| { |
| // Entering a look-behind block. |
| // Save Stack Ptr, Input Pos. |
| // TODO: implement transparent bounds. Ticket #6067 |
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| fData[opValue] = fStack->size(); |
| fData[opValue+1] = fp->fInputIdx; |
| // Init the variable containing the start index for attempted matches. |
| fData[opValue+2] = -1; |
| // Save input string length, then reset to pin any matches to end at |
| // the current position. |
| fData[opValue+3] = fActiveLimit; |
| fActiveLimit = fp->fInputIdx; |
| } |
| break; |
| |
| |
| case URX_LB_CONT: |
| { |
| // Positive Look-Behind, at top of loop checking for matches of LB expression |
| // at all possible input starting positions. |
| |
| // Fetch the min and max possible match lengths. They are the operands |
| // of this op in the pattern. |
| int32_t minML = (int32_t)pat[fp->fPatIdx++]; |
| int32_t maxML = (int32_t)pat[fp->fPatIdx++]; |
| U_ASSERT(minML <= maxML); |
| U_ASSERT(minML >= 0); |
| |
| // Fetch (from data) the last input index where a match was attempted. |
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| int64_t *lbStartIdx = &fData[opValue+2]; |
| if (*lbStartIdx < 0) { |
| // First time through loop. |
| *lbStartIdx = fp->fInputIdx - minML; |
| } else { |
| // 2nd through nth time through the loop. |
| // Back up start position for match by one. |
| if (*lbStartIdx == 0) { |
| (*lbStartIdx)--; |
| } else { |
| U16_BACK_1(inputBuf, 0, *lbStartIdx); |
| } |
| } |
| |
| if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { |
| // We have tried all potential match starting points without |
| // getting a match. Backtrack out, and out of the |
| // Look Behind altogether. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| int64_t restoreInputLen = fData[opValue+3]; |
| U_ASSERT(restoreInputLen >= fActiveLimit); |
| U_ASSERT(restoreInputLen <= fInputLength); |
| fActiveLimit = restoreInputLen; |
| break; |
| } |
| |
| // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. |
| // (successful match will fall off the end of the loop.) |
| fp = StateSave(fp, fp->fPatIdx-3, status); |
| fp->fInputIdx = *lbStartIdx; |
| } |
| break; |
| |
| case URX_LB_END: |
| // End of a look-behind block, after a successful match. |
| { |
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| if (fp->fInputIdx != fActiveLimit) { |
| // The look-behind expression matched, but the match did not |
| // extend all the way to the point that we are looking behind from. |
| // FAIL out of here, which will take us back to the LB_CONT, which |
| // will retry the match starting at another position or fail |
| // the look-behind altogether, whichever is appropriate. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| // Look-behind match is good. Restore the orignal input string length, |
| // which had been truncated to pin the end of the lookbehind match to the |
| // position being looked-behind. |
| int64_t originalInputLen = fData[opValue+3]; |
| U_ASSERT(originalInputLen >= fActiveLimit); |
| U_ASSERT(originalInputLen <= fInputLength); |
| fActiveLimit = originalInputLen; |
| } |
| break; |
| |
| |
| case URX_LBN_CONT: |
| { |
| // Negative Look-Behind, at top of loop checking for matches of LB expression |
| // at all possible input starting positions. |
| |
| // Fetch the extra parameters of this op. |
| int32_t minML = (int32_t)pat[fp->fPatIdx++]; |
| int32_t maxML = (int32_t)pat[fp->fPatIdx++]; |
| int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; |
| continueLoc = URX_VAL(continueLoc); |
| U_ASSERT(minML <= maxML); |
| U_ASSERT(minML >= 0); |
| U_ASSERT(continueLoc > fp->fPatIdx); |
| |
| // Fetch (from data) the last input index where a match was attempted. |
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| int64_t *lbStartIdx = &fData[opValue+2]; |
| if (*lbStartIdx < 0) { |
| // First time through loop. |
| *lbStartIdx = fp->fInputIdx - minML; |
| } else { |
| // 2nd through nth time through the loop. |
| // Back up start position for match by one. |
| if (*lbStartIdx == 0) { |
| (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0. |
| } else { |
| U16_BACK_1(inputBuf, 0, *lbStartIdx); |
| } |
| } |
| |
| if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { |
| // We have tried all potential match starting points without |
| // getting a match, which means that the negative lookbehind as |
| // a whole has succeeded. Jump forward to the continue location |
| int64_t restoreInputLen = fData[opValue+3]; |
| U_ASSERT(restoreInputLen >= fActiveLimit); |
| U_ASSERT(restoreInputLen <= fInputLength); |
| fActiveLimit = restoreInputLen; |
| fp->fPatIdx = continueLoc; |
| break; |
| } |
| |
| // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. |
| // (successful match will cause a FAIL out of the loop altogether.) |
| fp = StateSave(fp, fp->fPatIdx-4, status); |
| fp->fInputIdx = *lbStartIdx; |
| } |
| break; |
| |
| case URX_LBN_END: |
| // End of a negative look-behind block, after a successful match. |
| { |
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| if (fp->fInputIdx != fActiveLimit) { |
| // The look-behind expression matched, but the match did not |
| // extend all the way to the point that we are looking behind from. |
| // FAIL out of here, which will take us back to the LB_CONT, which |
| // will retry the match starting at another position or succeed |
| // the look-behind altogether, whichever is appropriate. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| break; |
| } |
| |
| // Look-behind expression matched, which means look-behind test as |
| // a whole Fails |
| |
| // Restore the orignal input string length, which had been truncated |
| // inorder to pin the end of the lookbehind match |
| // to the position being looked-behind. |
| int64_t originalInputLen = fData[opValue+3]; |
| U_ASSERT(originalInputLen >= fActiveLimit); |
| U_ASSERT(originalInputLen <= fInputLength); |
| fActiveLimit = originalInputLen; |
| |
| // Restore original stack position, discarding any state saved |
| // by the successful pattern match. |
| U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
| int32_t newStackSize = (int32_t)fData[opValue]; |
| U_ASSERT(fStack->size() > newStackSize); |
| fStack->setSize(newStackSize); |
| |
| // FAIL, which will take control back to someplace |
| // prior to entering the look-behind test. |
| fp = (REStackFrame *)fStack->popFrame(fFrameSize); |
| } |
| break; |
| |
| |
| case URX_LOOP_SR_I: |
| // Loop Initialization for the optimized implementation of |
| // [some character set]* |
| // This op scans through all matching input. |
| // The following LOOP_C op emulates stack unwinding if the following pattern fails. |
| { |
| U_ASSERT(opValue > 0 && opValue < sets->size()); |
| Regex8BitSet *s8 = &fPattern->fSets8[opValue]; |
| UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); |
| |
| // Loop through input, until either the input is exhausted or |
| // we reach a character that is not a member of the set. |
| int32_t ix = (int32_t)fp->fInputIdx; |
| for (;;) { |
| if (ix >= fActiveLimit) { |
| fHitEnd = TRUE; |
| break; |
| } |
| UChar32 c; |
| U16_NEXT(inputBuf, ix, fActiveLimit, c); |
| if (c<256) { |
| if (s8->contains(c) == FALSE) { |
| U16_BACK_1(inputBuf, 0, ix); |
| break; |
| } |
| } else { |
| if (s->contains(c) == FALSE) { |
| U16_BACK_1(inputBuf, 0, ix); |
| break; |
| } |
| } |
| } |
| |
| // If there were no matching characters, skip over the loop altogether. |
| // The loop doesn't run at all, a * op always succeeds. |
| if (ix == fp->fInputIdx) { |
| fp->fPatIdx++; // skip the URX_LOOP_C op. |
| break; |
| } |
| |
| // Peek ahead in the compiled pattern, to the URX_LOOP_C that |
| // must follow. It's operand is the stack location |
| // that holds the starting input index for the match of this [set]* |
| int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; |
| U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); |
| int32_t stackLoc = URX_VAL(loopcOp); |
| U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); |
| fp->fExtra[stackLoc] = fp->fInputIdx; |
| fp->fInputIdx = ix; |
| |
| // Save State to the URX_LOOP_C op that follows this one, |
| // so that match failures in the following code will return to there. |
| // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. |
| fp = StateSave(fp, fp->fPatIdx, status); |
| fp->fPatIdx++; |
| } |
| break; |
| |
| |
| case URX_LOOP_DOT_I: |
| // Loop Initialization for the optimized implementation of .* |
| // This op scans through all remaining input. |
| // The following LOOP_C op emulates stack unwinding if the following pattern fails. |
| { |
| // Loop through input until the input is exhausted (we reach an end-of-line) |
| // In DOTALL mode, we can just go straight to the end of the input. |
| int32_t ix; |
| if ((opValue & 1) == 1) { |
| // Dot-matches-All mode. Jump straight to the end of the string. |
| ix = (int32_t)fActiveLimit; |
| fHitEnd = TRUE; |
| } else { |
| // NOT DOT ALL mode. Line endings do not match '.' |
| // Scan forward until a line ending or end of input. |
| ix = (int32_t)fp->fInputIdx; |
| for (;;) { |
| if (ix >= fActiveLimit) { |
| fHitEnd = TRUE; |
| break; |
| } |
| UChar32 c; |
| U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++] |
| if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s |
| if ((c == 0x0a) || // 0x0a is newline in both modes. |
| (((opValue & 2) == 0) && // IF not UNIX_LINES mode |
| isLineTerminator(c))) { |
| // char is a line ending. Put the input pos back to the |
| // line ending char, and exit the scanning loop. |
| U16_BACK_1(inputBuf, 0, ix); |
| break; |
| } |
| } |
| } |
| } |
| |
| // If there were no matching characters, skip over the loop altogether. |
| // The loop doesn't run at all, a * op always succeeds. |
| if (ix == fp->fInputIdx) { |
| fp->fPatIdx++; // skip the URX_LOOP_C op. |
| break; |
| } |
| |
| // Peek ahead in the compiled pattern, to the URX_LOOP_C that |
| // must follow. It's operand is the stack location |
| // that holds the starting input index for the match of this .* |
| int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; |
| U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); |
| int32_t stackLoc = URX_VAL(loopcOp); |
| U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); |
| fp->fExtra[stackLoc] = fp->fInputIdx; |
| fp->fInputIdx = ix; |
| |
| // Save State to the URX_LOOP_C op that follows this one, |
| // so that match failures in the following code will return to there. |
| // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. |
| fp = StateSave(fp, fp->fPatIdx, status); |
| fp->fPatIdx++; |
| } |
| break; |
| |
| |
| case URX_LOOP_C: |
| { |
| U_ASSERT(opValue>=0 && opValue<fFrameSize); |
| backSearchIndex = (int32_t)fp->fExtra[opValue]; |
| U_ASSERT(backSearchIndex <= fp->fInputIdx); |
| if (backSearchIndex == fp->fInputIdx) { |
| // We've backed up the input idx to the point that the loop started. |
| // The loop is done. Leave here without saving state. |
| // Subsequent failures won't come back here. |
| break; |
| } |
| // Set up for the next iteration of the loop, with input index |
| // backed up by one from the last time through, |
| // and a state save to this instruction in case the following code fails again. |
| // (We're going backwards because this loop emulates stack unwinding, not |
| // the initial scan forward.) |
| U_ASSERT(fp->fInputIdx > 0); |
| UChar32 prevC; |
| U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit? |
| |
| if (prevC == 0x0a && |
| fp->fInputIdx > backSearchIndex && |
| inputBuf[fp->fInputIdx-1] == 0x0d) { |
| int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; |
| if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { |
| // .*, stepping back over CRLF pair. |
| U16_BACK_1(inputBuf, 0, fp->fInputIdx); |
| } |
| } |
| |
| |
| fp = StateSave(fp, fp->fPatIdx-1, status); |
| } |
| break; |
| |
| |
| |
| default: |
| // Trouble. The compiled pattern contains an entry with an |
| // unrecognized type tag. |
| U_ASSERT(FALSE); |
| } |
| |
| if (U_FAILURE(status)) { |
| isMatch = FALSE; |
| break; |
| } |
| } |
| |
| breakFromLoop: |
| fMatch = isMatch; |
| if (isMatch) { |
| fLastMatchEnd = fMatchEnd; |
| fMatchStart = startIdx; |
| fMatchEnd = fp->fInputIdx; |
| } |
| |
| #ifdef REGEX_RUN_DEBUG |
| if (fTraceDebug) { |
| if (isMatch) { |
| printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd); |
| } else { |
| printf("No match\n\n"); |
| } |
| } |
| #endif |
| |
| fFrame = fp; // The active stack frame when the engine stopped. |
| // Contains the capture group results that we need to |
| // access later. |
| |
| return; |
| } |
| |
| |
| UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) |
| |
| U_NAMESPACE_END |
| |
| #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |