| # Copyright (C) 2016 and later: Unicode, Inc. and others. |
| # License & terms of use: http://www.unicode.org/copyright.html |
| #***************************************************************************** |
| # |
| # Copyright (C) 2002-2015, International Business Machines Corporation and others. |
| # All Rights Reserved. |
| # |
| #***************************************************************************** |
| # |
| # file: regexcst.txt |
| # ICU Regular Expression Parser State Table |
| # |
| # This state table is used when reading and parsing a regular expression pattern |
| # The pattern parser uses a state machine; the data in this file define the |
| # state transitions that occur for each input character. |
| # |
| # *** This file defines the regex pattern grammar. This is it. |
| # *** The determination of what is accepted is here. |
| # |
| # This file is processed by a perl script "regexcst.pl" to produce initialized C arrays |
| # that are then built with the rule parser. |
| # |
| |
| # |
| # Here is the syntax of the state definitions in this file: |
| # |
| # |
| #StateName: |
| # input-char n next-state ^push-state action |
| # input-char n next-state ^push-state action |
| # | | | | | |
| # | | | | |--- action to be performed by state machine |
| # | | | | See function RegexCompile::doParseActions() |
| # | | | | |
| # | | | |--- Push this named state onto the state stack. |
| # | | | Later, when next state is specified as "pop", |
| # | | | the pushed state will become the current state. |
| # | | | |
| # | | |--- Transition to this state if the current input character matches the input |
| # | | character or char class in the left hand column. "pop" causes the next |
| # | | state to be popped from the state stack. |
| # | | |
| # | |--- When making the state transition specified on this line, advance to the next |
| # | character from the input only if 'n' appears here. |
| # | |
| # |--- Character or named character classes to test for. If the current character being scanned |
| # matches, perform the actions and go to the state specified on this line. |
| # The input character is tested sequentially, in the order written. The characters and |
| # character classes tested for do not need to be mutually exclusive. The first match wins. |
| # |
| |
| |
| |
| |
| # |
| # start state, scan position is at the beginning of the pattern. |
| # Whatever the first character is, run doPatStart and move to "term" |
| # without consuming any input (no 'n' on the transition). |
| # |
| start: |
| default term doPatStart |
| |
| |
| |
| |
| # |
| # term. At a position where we can accept the start of most items in a pattern. |
| # '[' pushes set-finish, so the ']' that closes the set pops back to it. |
| # ')' pops the state stack; the open-paren states push the state to resume in. |
| # |
| term: |
| quoted n expr-quant doLiteralChar |
| rule_char n expr-quant doLiteralChar |
| '[' n set-open ^set-finish doSetBegin |
| '(' n open-paren |
| '.' n expr-quant doDotAny |
| '^' n expr-quant doCaret |
| '$' n expr-quant doDollar |
| '\' n backslash |
| '|' n term doOrOperator |
| ')' n pop doCloseParen |
| eof term doPatFinish |
| default errorDeath doRuleError |
| |
| |
| |
| # |
| # expr-quant We've just finished scanning a term, now look for the optional |
| # trailing quantifier - *, +, ?, *?, etc. |
| # '(' is routed through open-paren-quant so that a (?# comment ) |
| # sitting between a term and its quantifier is handled. |
| # Anything else falls through to expr-cont without consuming input. |
| # |
| expr-quant: |
| '*' n quant-star |
| '+' n quant-plus |
| '?' n quant-opt |
| '{' n interval-open doIntervalInit |
| '(' n open-paren-quant |
| default expr-cont |
| |
| |
| # |
| # expr-cont Expression, continuation. At a point where additional terms are |
| # allowed, but not required. No Quantifiers |
| # Anything other than '|' or ')' is handed to term without being consumed. |
| # |
| expr-cont: |
| '|' n term doOrOperator |
| ')' n pop doCloseParen |
| default term |
| |
| |
| # |
| # open-paren-quant Special case handling for comments appearing before a quantifier, |
| # e.g. x(?#comment )* |
| # Open parens from expr-quant come here; anything but a (?# comment |
| # branches into the normal parenthesis sequence as quickly as possible. |
| # The '?' row mirrors the '?' row of open-paren (same action, doSuppressComments); |
| # any other character is left unconsumed for open-paren to examine. |
| # |
| open-paren-quant: |
| '?' n open-paren-quant2 doSuppressComments |
| default open-paren |
| |
| # open-paren-quant2 Have scanned "(?" immediately after a term. |
| # "(?#" starts a comment; expr-quant is pushed so that scanning |
| # resumes looking for a quantifier once paren-comment pops. |
| # Anything else is left unconsumed for open-paren-extended. |
| open-paren-quant2: |
| '#' n paren-comment ^expr-quant |
| default open-paren-extended |
| |
| |
| # |
| # open-paren We've got an open paren. We need to scan further to |
| # determine what kind of parenthesized group it is - plain (, (?:, (?>, or whatever. |
| # A plain '(' opens a capture group; expr-quant is pushed so that a |
| # quantifier may follow the closing ')'. |
| # |
| open-paren: |
| '?' n open-paren-extended doSuppressComments |
| default term ^expr-quant doOpenCaptureParen |
| |
| # open-paren-extended Have scanned "(?". The next character selects the construct. |
| # The look-ahead forms push expr-cont rather than expr-quant. |
| # Flag letters and '-' are not consumed here (no 'n'); |
| # paren-flag consumes them. |
| open-paren-extended: |
| ':' n term ^expr-quant doOpenNonCaptureParen # (?: |
| '>' n term ^expr-quant doOpenAtomicParen # (?> |
| '=' n term ^expr-cont doOpenLookAhead # (?= |
| '!' n term ^expr-cont doOpenLookAheadNeg # (?! |
| '<' n open-paren-lookbehind # (?< |
| '#' n paren-comment ^term # (?# |
| 'i' paren-flag doBeginMatchMode |
| 'd' paren-flag doBeginMatchMode |
| 'm' paren-flag doBeginMatchMode |
| 's' paren-flag doBeginMatchMode |
| 'u' paren-flag doBeginMatchMode |
| 'w' paren-flag doBeginMatchMode |
| 'x' paren-flag doBeginMatchMode |
| '-' paren-flag doBeginMatchMode |
| '(' n errorDeath doConditionalExpr # (?( |
| '{' n errorDeath doPerlInline # (?{ |
| default errorDeath doBadOpenParenType |
| |
| # open-paren-lookbehind Have scanned "(?<". Either a look-behind or a named capture group. |
| # The first letter of a name is not consumed here (no 'n'); |
| # named-capture consumes it. |
| open-paren-lookbehind: |
| '=' n term ^expr-cont doOpenLookBehind # (?<= |
| '!' n term ^expr-cont doOpenLookBehindNeg # (?<! |
| ascii_letter named-capture doBeginNamedCapture # (?<name |
| default errorDeath doBadOpenParenType |
| |
| |
| # |
| # paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')' |
| # The ')' pops to the state pushed by whoever entered the comment: |
| # term (from open-paren-extended) or expr-quant (from open-paren-quant2). |
| # Hitting eof before the ')' is a mismatched-paren error. |
| # |
| paren-comment: |
| ')' n pop |
| eof errorDeath doMismatchedParenErr |
| default n paren-comment |
| |
| # |
| # paren-flag Scanned a (?ismx-ismx flag setting |
| # Each flag character (and '-') is consumed with doMatchMode. |
| # ')' ends a bare flag setting, (?flags) |
| # ':' begins a flagged group, (?flags: ... ), pushing expr-quant. |
| # |
| paren-flag: |
| 'i' n paren-flag doMatchMode |
| 'd' n paren-flag doMatchMode |
| 'm' n paren-flag doMatchMode |
| 's' n paren-flag doMatchMode |
| 'u' n paren-flag doMatchMode |
| 'w' n paren-flag doMatchMode |
| 'x' n paren-flag doMatchMode |
| '-' n paren-flag doMatchMode |
| ')' n term doSetMatchMode |
| ':' n term ^expr-quant doMatchModeParen |
| default errorDeath doBadModeFlag |
| |
| # |
| # named-capture (?<name> ... ), position currently on the name. |
| # Entered from open-paren-lookbehind, which required the first |
| # character to be an ascii_letter; here letters and digits are |
| # accepted until the closing '>'. |
| # |
| named-capture: |
| ascii_letter n named-capture doContinueNamedCapture |
| digit_char n named-capture doContinueNamedCapture |
| '>' n term ^expr-quant doOpenCaptureParen # common w non-named capture. |
| default errorDeath doBadNamedCapture |
| |
| # |
| # quant-star Scanning a '*' quantifier. Need to look ahead to decide |
| # between plain '*', '*?', '*+' |
| # The default row does not consume the look-ahead character. |
| # |
| quant-star: |
| '?' n expr-cont doNGStar # *? |
| '+' n expr-cont doPossessiveStar # *+ |
| default expr-cont doStar # * |
| |
| |
| # |
| # quant-plus Scanning a '+' quantifier. Need to look ahead to decide |
| # between plain '+', '+?', '++' |
| # The default row does not consume the look-ahead character. |
| # |
| quant-plus: |
| '?' n expr-cont doNGPlus # +? |
| '+' n expr-cont doPossessivePlus # ++ |
| default expr-cont doPlus # + |
| |
| |
| # |
| # quant-opt Scanning a '?' quantifier. Need to look ahead to decide |
| # between plain '?', '??', '?+' |
| # The default row does not consume the look-ahead character. |
| # |
| quant-opt: |
| '?' n expr-cont doNGOpt # ?? |
| '+' n expr-cont doPossessiveOpt # ?+ |
| default expr-cont doOpt # ? |
| |
| |
| # |
| # Interval scanning a '{', the opening delimiter for an interval specification |
| # {number} or {min, max} or {min,} |
| # At least one digit must follow the '{'. The digit is not consumed here |
| # (no 'n'); interval-lower consumes it. |
| # |
| interval-open: |
| digit_char interval-lower |
| default errorDeath doIntervalError |
| |
| # interval-lower Scanning the digits of the lower bound (or the only value) of an interval. |
| # ',' moves on to the upper bound; '}' ends a single-value {n} interval. |
| interval-lower: |
| digit_char n interval-lower doIntevalLowerDigit |
| ',' n interval-upper |
| '}' n interval-type doIntervalSame # {n} |
| default errorDeath doIntervalError |
| |
| # interval-upper Have scanned "{n,". Scan the upper-bound digits, if any, up to the closing '}'. |
| # A '}' with no digits is accepted here, i.e. the {min,} form. |
| interval-upper: |
| digit_char n interval-upper doIntervalUpperDigit |
| '}' n interval-type |
| default errorDeath doIntervalError |
| |
| # interval-type Have scanned a complete interval through its '}'. Look ahead to decide |
| # between plain {n,m}, {n,m}? and {n,m}+ |
| # The default row does not consume the look-ahead character. |
| interval-type: |
| '?' n expr-cont doNGInterval # {n,m}? |
| '+' n expr-cont doPossessiveInterval # {n,m}+ |
| default expr-cont doInterval # {n,m} |
| |
| |
| # |
| # backslash # Backslash. Figure out which of the \thingies we have encountered. |
| # The low level next-char function will have preprocessed |
| # some of them already; those won't come here. |
| # Escapes that go to "term" cannot be followed by a quantifier; |
| # those that go to "expr-quant" can. |
| # 'N', 'p' and 'P' are not consumed by the transition (no 'n'); |
| # presumably their actions scan the rest of the construct - confirm |
| # against doNamedChar / doProperty. |
| # Any character not listed is treated as an escaped literal. |
| # |
| backslash: |
| 'A' n term doBackslashA |
| 'B' n term doBackslashB |
| 'b' n term doBackslashb |
| 'd' n expr-quant doBackslashd |
| 'D' n expr-quant doBackslashD |
| 'G' n term doBackslashG |
| 'h' n expr-quant doBackslashh |
| 'H' n expr-quant doBackslashH |
| 'k' n named-backref |
| 'N' expr-quant doNamedChar # \N{NAME} named char |
| 'p' expr-quant doProperty # \p{Lu} style property |
| 'P' expr-quant doProperty |
| 'R' n expr-quant doBackslashR |
| 'Q' n term doEnterQuoteMode |
| 'S' n expr-quant doBackslashS |
| 's' n expr-quant doBackslashs |
| 'v' n expr-quant doBackslashv |
| 'V' n expr-quant doBackslashV |
| 'W' n expr-quant doBackslashW |
| 'w' n expr-quant doBackslashw |
| 'X' n expr-quant doBackslashX |
| 'Z' n term doBackslashZ |
| 'z' n term doBackslashz |
| digit_char n expr-quant doBackRef # Will scan multiple digits |
| eof errorDeath doEscapeError |
| default n expr-quant doEscapedLiteralChar |
| |
| |
| # |
| # named-backref Scanned \k |
| # Leading to \k<captureName> |
| # Failure to get the full sequence is an error. |
| # The '<' must follow immediately. |
| # |
| named-backref: |
| '<' n named-backref-2 doBeginNamedBackRef |
| default errorDeath doBadNamedCapture |
| |
| # named-backref-2 Scanned "\k<". The first character of the name must be an ascii_letter. |
| named-backref-2: |
| ascii_letter n named-backref-3 doContinueNamedBackRef |
| default errorDeath doBadNamedCapture |
| |
| # named-backref-3 Scanning the rest of the name: letters and digits up to the closing '>'. |
| # A completed back reference may be followed by a quantifier. |
| named-backref-3: |
| ascii_letter n named-backref-3 doContinueNamedBackRef |
| digit_char n named-backref-3 doContinueNamedBackRef |
| '>' n expr-quant doCompleteNamedBackRef |
| default errorDeath doBadNamedCapture |
| |
| |
| # |
| # [set expression] parsing, |
| # All states involved in parsing set expressions have names beginning with "set-" |
| # |
| |
| # set-open Have just scanned the '[' that opens a set (top level or nested). |
| # Handle a leading '^' (negation) or a possible "[:" posix-style property. |
| # The ':' is not consumed here (no 'n'). |
| set-open: |
| '^' n set-open2 doSetNegate |
| ':' set-posix doSetPosixProp |
| default set-open2 |
| |
| # set-open2 After '[' or '[^'. A ']' in this position is a literal member of the set, |
| # not the closing bracket. |
| set-open2: |
| ']' n set-after-lit doSetLiteral |
| default set-start |
| |
| # set-posix: |
| # scanned a '[:' If it really is a [:property:], doSetPosixProp will have |
| # moved the scan to the closing ']'. If it wasn't a property |
| # expression, the scan will still be at the opening ':', which should |
| # be interpreted as a normal set expression. |
| # In that case the ':' is handed to set-start still unconsumed. |
| set-posix: |
| ']' n pop doSetEnd |
| ':' set-start |
| default errorDeath doRuleError # should not be possible. |
| |
| # |
| # set-start after the [ and special case leading characters (^ and/or ]) but before |
| # everything else. A '-' is literal at this point. |
| # A nested '[' pushes set-after-set, where scanning resumes after the |
| # nested set closes. |
| # NOTE(review): unlike set-after-lit, there is no explicit eof row here; |
| # confirm how end of pattern right after '[' is reported. |
| # |
| set-start: |
| ']' n pop doSetEnd |
| '[' n set-open ^set-after-set doSetBeginUnion |
| '\' n set-escape |
| '-' n set-start-dash |
| '&' n set-start-amp |
| default n set-after-lit doSetLiteral |
| |
| # set-start-dash Turn "[--" into a syntax error. |
| # "[-x" is good, - and x are literals. |
| # Neither row consumes input; after doSetAddDash the following |
| # character is examined again by set-after-lit. |
| # |
| set-start-dash: |
| '-' errorDeath doRuleError |
| default set-after-lit doSetAddDash |
| |
| # set-start-amp Turn "[&&" into a syntax error. |
| # "[&x" is good, & and x are literals. |
| # Neither row consumes input; after doSetAddAmp the following |
| # character is examined again by set-after-lit. |
| # |
| set-start-amp: |
| '&' errorDeath doRuleError |
| default set-after-lit doSetAddAmp |
| |
| # |
| # set-after-lit The last thing scanned was a literal character within a set. |
| # Can be followed by anything. Single '-' or '&' are |
| # literals in this context, not operators. |
| # A '-' may still begin a range or a "--" operator (see set-lit-dash), |
| # and '&' may begin "&&" (see set-lit-amp). |
| # |
| set-after-lit: |
| ']' n pop doSetEnd |
| '[' n set-open ^set-after-set doSetBeginUnion |
| '-' n set-lit-dash |
| '&' n set-lit-amp |
| '\' n set-escape |
| eof errorDeath doSetNoCloseError |
| default n set-after-lit doSetLiteral |
| |
| # |
| # set-after-set The last thing scanned was a complete nested [set], or a \p / \P |
| # property (see set-escape). Same rows as set-after-lit except that |
| # '-' and '&' go to set-set-dash / set-set-amp, where they can act as |
| # operators when directly followed by another '[' set. |
| # |
| set-after-set: |
| ']' n pop doSetEnd |
| '[' n set-open ^set-after-set doSetBeginUnion |
| '-' n set-set-dash |
| '&' n set-set-amp |
| '\' n set-escape |
| eof errorDeath doSetNoCloseError |
| default n set-after-lit doSetLiteral |
| |
| # |
| # set-after-range The last thing scanned was a range (a-b) or a class escape such as |
| # \w or \s (see set-escape). Same rows as set-after-lit except that |
| # '-' and '&' go to set-range-dash / set-range-amp: a single one is |
| # a literal, only "--" / "&&" act as operators. |
| # |
| set-after-range: |
| ']' n pop doSetEnd |
| '[' n set-open ^set-after-set doSetBeginUnion |
| '-' n set-range-dash |
| '&' n set-range-amp |
| '\' n set-escape |
| eof errorDeath doSetNoCloseError |
| default n set-after-lit doSetLiteral |
| |
| |
| # set-after-op |
| # After a -- or && |
| # It is an error to close a set at this point. |
| # The offending ']' is not consumed (no 'n'). |
| # NOTE(review): no explicit eof row here, unlike set-after-lit; |
| # confirm how end of pattern after an operator is reported. |
| # |
| set-after-op: |
| '[' n set-open ^set-after-set doSetBeginUnion |
| ']' errorDeath doSetOpError |
| '\' n set-escape |
| default n set-after-lit doSetLiteral |
| |
| # |
| # set-set-amp |
| # Have scanned [[set]& |
| # Could be a '&' intersection operator, if a set follows. |
| # Could be the start of a '&&' operator. |
| # Otherwise is a literal. |
| # In the literal case the following character is not consumed here. |
| set-set-amp: |
| '[' n set-open ^set-after-set doSetBeginIntersection1 |
| '&' n set-after-op doSetIntersection2 |
| default set-after-lit doSetAddAmp |
| |
| |
| # set-lit-amp Have scanned "[literals&" |
| # Could be a start of "&&" operator or a literal |
| # In [abc&[def]], the '&' is a literal |
| # In the literal case the following character is not consumed here; |
| # set-after-lit examines it again. |
| # |
| set-lit-amp: |
| '&' n set-after-op doSetIntersection2 |
| default set-after-lit doSetAddAmp |
| |
| |
| # |
| # set-set-dash |
| # Have scanned [set]- |
| # Could be a '-' difference operator, if a [set] follows. |
| # Could be the start of a '--' operator. |
| # Otherwise is a literal. |
| # In the literal case the following character is not consumed here. |
| set-set-dash: |
| '[' n set-open ^set-after-set doSetBeginDifference1 |
| '-' n set-after-op doSetDifference2 |
| default set-after-lit doSetAddDash |
| |
| |
| # |
| # set-range-dash |
| # scanned a-b- or \w- |
| # any set or range like item where the trailing single '-' should |
| # be literal, not a set difference operation. |
| # A trailing "--" is still a difference operator. |
| # In the literal case the following character is not consumed here. |
| set-range-dash: |
| '-' n set-after-op doSetDifference2 |
| default set-after-lit doSetAddDash |
| |
| |
| # |
| # set-range-amp |
| # scanned a-b& or \w& |
| # Counterpart of set-range-dash for '&': a trailing single '&' is a |
| # literal, while "&&" is still an intersection operator. |
| set-range-amp: |
| '&' n set-after-op doSetIntersection2 |
| default set-after-lit doSetAddAmp |
| |
| |
| # set-lit-dash |
| # Have scanned "[literals-" Could be a range or a -- operator or a literal |
| # In [abc-[def]], the '-' is a literal (confirmed with a Java test) |
| # [abc-\p{xx} the '-' is an error |
| # [abc-] the '-' is a literal |
| # [ab-xy] the '-' is a range |
| # The '[' and ']' rows do not consume input; set-after-lit examines |
| # the bracket again after the '-' has been added as a literal. |
| # NOTE(review): the \p error case above is not visible in this table |
| # (see set-lit-dash-escape); presumably the action reports it - confirm. |
| # |
| set-lit-dash: |
| '-' n set-after-op doSetDifference2 |
| '[' set-after-lit doSetAddDash |
| ']' set-after-lit doSetAddDash |
| '\' n set-lit-dash-escape |
| default n set-after-range doSetRange |
| |
| # set-lit-dash-escape |
| # |
| # scanned "[literal-\" |
| # Could be a range, if the \ introduces an escaped literal char or a named char. |
| # Otherwise it is an error. |
| # 'N' is not consumed by the transition (no 'n'). |
| # NOTE(review): only \s \S \w \W \d \D are rejected explicitly here. |
| # \p \P \h \H \v \V (all handled in set-escape) fall to the default row |
| # and reach doSetRange; confirm that the action rejects them. |
| # |
| set-lit-dash-escape: |
| 's' errorDeath doSetOpError |
| 'S' errorDeath doSetOpError |
| 'w' errorDeath doSetOpError |
| 'W' errorDeath doSetOpError |
| 'd' errorDeath doSetOpError |
| 'D' errorDeath doSetOpError |
| 'N' set-after-range doSetNamedRange |
| default n set-after-range doSetRange |
| |
| |
| # |
| # set-escape |
| # Common back-slash escape processing within set expressions |
| # The next state records what kind of item was just scanned: |
| # \p \P -> set-after-set |
| # \N and any unlisted escape -> set-after-lit |
| # \s \S \w \W \d \D \h \H \v \V -> set-after-range |
| # 'p', 'P' and 'N' are not consumed by the transition (no 'n'); |
| # presumably their actions scan the rest of the construct - confirm. |
| # |
| set-escape: |
| 'p' set-after-set doSetProp |
| 'P' set-after-set doSetProp |
| 'N' set-after-lit doSetNamedChar |
| 's' n set-after-range doSetBackslash_s |
| 'S' n set-after-range doSetBackslash_S |
| 'w' n set-after-range doSetBackslash_w |
| 'W' n set-after-range doSetBackslash_W |
| 'd' n set-after-range doSetBackslash_d |
| 'D' n set-after-range doSetBackslash_D |
| 'h' n set-after-range doSetBackslash_h |
| 'H' n set-after-range doSetBackslash_H |
| 'v' n set-after-range doSetBackslash_v |
| 'V' n set-after-range doSetBackslash_V |
| default n set-after-lit doSetLiteralEscaped |
| |
| # |
| # set-finish |
| # Have just encountered the final ']' that completes a [set], and |
| # arrived here via a pop. From here, we exit the set parsing world, and go |
| # back to generic regular expression parsing. |
| # This state was pushed by the '[' row of "term". No input is consumed; |
| # the completed set may be followed by a quantifier. |
| # |
| set-finish: |
| default expr-quant doSetFinish |
| |
| |
| # |
| # errorDeath. This state is specified as the next state whenever a syntax error |
| # in the pattern is detected. Barring bugs, the state machine will never |
| # actually get here, but will stop because of the action associated with the error. |
| # But, just in case, this state asks the state machine to exit. |
| errorDeath: |
| default n errorDeath doExit |
| |
| |