blob: d69a7ea36935b5b7f716b4e5d64170a57d095919 [file] [log] [blame]
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#*****************************************************************************
#
# Copyright (C) 2002-2015, International Business Machines Corporation and others.
# All Rights Reserved.
#
#*****************************************************************************
#
# file: regexcst.txt
# ICU Regular Expression Parser State Table
#
# This state table is used when reading and parsing a regular expression pattern
# The pattern parser uses a state machine; the data in this file define the
# state transitions that occur for each input character.
#
# *** This file defines the regex pattern grammar. This is it.
# *** The determination of what is accepted is here.
#
# This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
# that are then built with the rule parser.
#
#
# Here is the syntax of the state definitions in this file:
#
#
#StateName:
# input-char n next-state ^push-state action
# input-char n next-state ^push-state action
# | | | | |
# | | | | |--- action to be performed by state machine
# | | | | See function RBBIRuleScanner::doParseActions()
# | | | |
# | | | |--- Push this named state onto the state stack.
# | | | Later, when next state is specified as "pop",
# | | | the pushed state will become the current state.
# | | |
# | | |--- Transition to this state if the current input character matches the input
# | | character or char class in the left hand column. "pop" causes the next
# | | state to be popped from the state stack.
# | |
# | |--- When making the state transition specified on this line, advance to the next
# | character from the input only if 'n' appears here.
# |
# |--- Character or named character classes to test for. If the current character being scanned
# matches, peform the actions and go to the state specified on this line.
# The input character is tested sequentally, in the order written. The characters and
# character classes tested for do not need to be mutually exclusive. The first match wins.
#
#
# start state, scan position is at the beginning of the pattern.
#
start:
default term doPatStart
#
# term. At a position where we can accept the start most items in a pattern.
#
term:
quoted n expr-quant doLiteralChar
rule_char n expr-quant doLiteralChar
'[' n set-open ^set-finish doSetBegin
'(' n open-paren
'.' n expr-quant doDotAny
'^' n expr-quant doCaret
'$' n expr-quant doDollar
'\' n backslash
'|' n term doOrOperator
')' n pop doCloseParen
eof term doPatFinish
default errorDeath doRuleError
#
# expr-quant We've just finished scanning a term, now look for the optional
# trailing quantifier - *, +, ?, *?, etc.
#
expr-quant:
'*' n quant-star
'+' n quant-plus
'?' n quant-opt
'{' n interval-open doIntervalInit
'(' n open-paren-quant
default expr-cont
#
# expr-cont Expression, continuation. At a point where additional terms are
# allowed, but not required. No Quantifiers
#
expr-cont:
'|' n term doOrOperator
')' n pop doCloseParen
default term
#
# open-paren-quant Special case handling for comments appearing before a quantifier,
# e.g. x(?#comment )*
# Open parens from expr-quant come here; anything but a (?# comment
# branches into the normal parenthesis sequence as quickly as possible.
#
open-paren-quant:
'?' n open-paren-quant2 doSuppressComments
default open-paren
open-paren-quant2:
'#' n paren-comment ^expr-quant
default open-paren-extended
#
# open-paren We've got an open paren. We need to scan further to
# determine what kind of quantifier it is - plain (, (?:, (?>, or whatever.
#
open-paren:
'?' n open-paren-extended doSuppressComments
default term ^expr-quant doOpenCaptureParen
open-paren-extended:
':' n term ^expr-quant doOpenNonCaptureParen # (?:
'>' n term ^expr-quant doOpenAtomicParen # (?>
'=' n term ^expr-cont doOpenLookAhead # (?=
'!' n term ^expr-cont doOpenLookAheadNeg # (?!
'<' n open-paren-lookbehind
'#' n paren-comment ^term
'i' paren-flag doBeginMatchMode
'd' paren-flag doBeginMatchMode
'm' paren-flag doBeginMatchMode
's' paren-flag doBeginMatchMode
'u' paren-flag doBeginMatchMode
'w' paren-flag doBeginMatchMode
'x' paren-flag doBeginMatchMode
'-' paren-flag doBeginMatchMode
'(' n errorDeath doConditionalExpr
'{' n errorDeath doPerlInline
default errorDeath doBadOpenParenType
open-paren-lookbehind:
'=' n term ^expr-cont doOpenLookBehind # (?<=
'!' n term ^expr-cont doOpenLookBehindNeg # (?<!
ascii_letter named-capture doBeginNamedCapture # (?<name
default errorDeath doBadOpenParenType
#
# paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
#
paren-comment:
')' n pop
eof errorDeath doMismatchedParenErr
default n paren-comment
#
# paren-flag Scanned a (?ismx-ismx flag setting
#
paren-flag:
'i' n paren-flag doMatchMode
'd' n paren-flag doMatchMode
'm' n paren-flag doMatchMode
's' n paren-flag doMatchMode
'u' n paren-flag doMatchMode
'w' n paren-flag doMatchMode
'x' n paren-flag doMatchMode
'-' n paren-flag doMatchMode
')' n term doSetMatchMode
':' n term ^expr-quant doMatchModeParen
default errorDeath doBadModeFlag
#
# named-capture (?<name> ... ), position currently on the name.
#
named-capture:
ascii_letter n named-capture doContinueNamedCapture
digit_char n named-capture doContinueNamedCapture
'>' n term ^expr-quant doOpenCaptureParen # common w non-named capture.
default errorDeath doBadNamedCapture
#
# quant-star Scanning a '*' quantifier. Need to look ahead to decide
# between plain '*', '*?', '*+'
#
quant-star:
'?' n expr-cont doNGStar # *?
'+' n expr-cont doPossessiveStar # *+
default expr-cont doStar
#
# quant-plus Scanning a '+' quantifier. Need to look ahead to decide
# between plain '+', '+?', '++'
#
quant-plus:
'?' n expr-cont doNGPlus # *?
'+' n expr-cont doPossessivePlus # *+
default expr-cont doPlus
#
# quant-opt Scanning a '?' quantifier. Need to look ahead to decide
# between plain '?', '??', '?+'
#
quant-opt:
'?' n expr-cont doNGOpt # ??
'+' n expr-cont doPossessiveOpt # ?+
default expr-cont doOpt # ?
#
# Interval scanning a '{', the opening delimiter for an interval specification
# {number} or {min, max} or {min,}
#
interval-open:
digit_char interval-lower
default errorDeath doIntervalError
interval-lower:
digit_char n interval-lower doIntevalLowerDigit
',' n interval-upper
'}' n interval-type doIntervalSame # {n}
default errorDeath doIntervalError
interval-upper:
digit_char n interval-upper doIntervalUpperDigit
'}' n interval-type
default errorDeath doIntervalError
interval-type:
'?' n expr-cont doNGInterval # {n,m}?
'+' n expr-cont doPossessiveInterval # {n,m}+
default expr-cont doInterval # {m,n}
#
# backslash # Backslash. Figure out which of the \thingies we have encountered.
# The low level next-char function will have preprocessed
# some of them already; those won't come here.
backslash:
'A' n term doBackslashA
'B' n term doBackslashB
'b' n term doBackslashb
'd' n expr-quant doBackslashd
'D' n expr-quant doBackslashD
'G' n term doBackslashG
'h' n expr-quant doBackslashh
'H' n expr-quant doBackslashH
'k' n named-backref
'N' expr-quant doNamedChar # \N{NAME} named char
'p' expr-quant doProperty # \p{Lu} style property
'P' expr-quant doProperty
'R' n expr-quant doBackslashR
'Q' n term doEnterQuoteMode
'S' n expr-quant doBackslashS
's' n expr-quant doBackslashs
'v' n expr-quant doBackslashv
'V' n expr-quant doBackslashV
'W' n expr-quant doBackslashW
'w' n expr-quant doBackslashw
'X' n expr-quant doBackslashX
'Z' n term doBackslashZ
'z' n term doBackslashz
digit_char n expr-quant doBackRef # Will scan multiple digits
eof errorDeath doEscapeError
default n expr-quant doEscapedLiteralChar
# named-backref Scanned \k
# Leading to \k<captureName>
# Failure to get the full sequence is an error.
#
named-backref:
'<' n named-backref-2 doBeginNamedBackRef
default errorDeath doBadNamedCapture
named-backref-2:
ascii_letter n named-backref-3 doContinueNamedBackRef
default errorDeath doBadNamedCapture
named-backref-3:
ascii_letter n named-backref-3 doContinueNamedBackRef
digit_char n named-backref-3 doContinueNamedBackRef
'>' n expr-quant doCompleteNamedBackRef
default errorDeath doBadNamedCapture
#
# [set expression] parsing,
# All states involved in parsing set expressions have names beginning with "set-"
#
set-open:
'^' n set-open2 doSetNegate
':' set-posix doSetPosixProp
default set-open2
set-open2:
']' n set-after-lit doSetLiteral
default set-start
# set-posix:
# scanned a '[:' If it really is a [:property:], doSetPosixProp will have
# moved the scan to the closing ']'. If it wasn't a property
# expression, the scan will still be at the opening ':', which should
# be interpreted as a normal set expression.
set-posix:
']' n pop doSetEnd
':' set-start
default errorDeath doRuleError # should not be possible.
#
# set-start after the [ and special case leading characters (^ and/or ]) but before
# everything else. A '-' is literal at this point.
#
set-start:
']' n pop doSetEnd
'[' n set-open ^set-after-set doSetBeginUnion
'\' n set-escape
'-' n set-start-dash
'&' n set-start-amp
default n set-after-lit doSetLiteral
# set-start-dash Turn "[--" into a syntax error.
# "[-x" is good, - and x are literals.
#
set-start-dash:
'-' errorDeath doRuleError
default set-after-lit doSetAddDash
# set-start-amp Turn "[&&" into a syntax error.
# "[&x" is good, & and x are literals.
#
set-start-amp:
'&' errorDeath doRuleError
default set-after-lit doSetAddAmp
#
# set-after-lit The last thing scanned was a literal character within a set.
# Can be followed by anything. Single '-' or '&' are
# literals in this context, not operators.
set-after-lit:
']' n pop doSetEnd
'[' n set-open ^set-after-set doSetBeginUnion
'-' n set-lit-dash
'&' n set-lit-amp
'\' n set-escape
eof errorDeath doSetNoCloseError
default n set-after-lit doSetLiteral
set-after-set:
']' n pop doSetEnd
'[' n set-open ^set-after-set doSetBeginUnion
'-' n set-set-dash
'&' n set-set-amp
'\' n set-escape
eof errorDeath doSetNoCloseError
default n set-after-lit doSetLiteral
set-after-range:
']' n pop doSetEnd
'[' n set-open ^set-after-set doSetBeginUnion
'-' n set-range-dash
'&' n set-range-amp
'\' n set-escape
eof errorDeath doSetNoCloseError
default n set-after-lit doSetLiteral
# set-after-op
# After a -- or &&
# It is an error to close a set at this point.
#
set-after-op:
'[' n set-open ^set-after-set doSetBeginUnion
']' errorDeath doSetOpError
'\' n set-escape
default n set-after-lit doSetLiteral
#
# set-set-amp
# Have scanned [[set]&
# Could be a '&' intersection operator, if a set follows.
# Could be the start of a '&&' operator.
# Otherewise is a literal.
set-set-amp:
'[' n set-open ^set-after-set doSetBeginIntersection1
'&' n set-after-op doSetIntersection2
default set-after-lit doSetAddAmp
# set-lit-amp Have scanned "[literals&"
# Could be a start of "&&" operator or a literal
# In [abc&[def]], the '&' is a literal
#
set-lit-amp:
'&' n set-after-op doSetIntersection2
default set-after-lit doSetAddAmp
#
# set-set-dash
# Have scanned [set]-
# Could be a '-' difference operator, if a [set] follows.
# Could be the start of a '--' operator.
# Otherewise is a literal.
set-set-dash:
'[' n set-open ^set-after-set doSetBeginDifference1
'-' n set-after-op doSetDifference2
default set-after-lit doSetAddDash
#
# set-range-dash
# scanned a-b- or \w-
# any set or range like item where the trailing single '-' should
# be literal, not a set difference operation.
# A trailing "--" is still a difference operator.
set-range-dash:
'-' n set-after-op doSetDifference2
default set-after-lit doSetAddDash
set-range-amp:
'&' n set-after-op doSetIntersection2
default set-after-lit doSetAddAmp
# set-lit-dash
# Have scanned "[literals-" Could be a range or a -- operator or a literal
# In [abc-[def]], the '-' is a literal (confirmed with a Java test)
# [abc-\p{xx} the '-' is an error
# [abc-] the '-' is a literal
# [ab-xy] the '-' is a range
#
set-lit-dash:
'-' n set-after-op doSetDifference2
'[' set-after-lit doSetAddDash
']' set-after-lit doSetAddDash
'\' n set-lit-dash-escape
default n set-after-range doSetRange
# set-lit-dash-escape
#
# scanned "[literal-\"
# Could be a range, if the \ introduces an escaped literal char or a named char.
# Otherwise it is an error.
#
set-lit-dash-escape:
's' errorDeath doSetOpError
'S' errorDeath doSetOpError
'w' errorDeath doSetOpError
'W' errorDeath doSetOpError
'd' errorDeath doSetOpError
'D' errorDeath doSetOpError
'N' set-after-range doSetNamedRange
default n set-after-range doSetRange
#
# set-escape
# Common back-slash escape processing within set expressions
#
set-escape:
'p' set-after-set doSetProp
'P' set-after-set doSetProp
'N' set-after-lit doSetNamedChar
's' n set-after-range doSetBackslash_s
'S' n set-after-range doSetBackslash_S
'w' n set-after-range doSetBackslash_w
'W' n set-after-range doSetBackslash_W
'd' n set-after-range doSetBackslash_d
'D' n set-after-range doSetBackslash_D
'h' n set-after-range doSetBackslash_h
'H' n set-after-range doSetBackslash_H
'v' n set-after-range doSetBackslash_v
'V' n set-after-range doSetBackslash_V
default n set-after-lit doSetLiteralEscaped
#
# set-finish
# Have just encountered the final ']' that completes a [set], and
# arrived here via a pop. From here, we exit the set parsing world, and go
# back to generic regular expression parsing.
#
set-finish:
default expr-quant doSetFinish
#
# errorDeath. This state is specified as the next state whenever a syntax error
# in the source rules is detected. Barring bugs, the state machine will never
# actually get here, but will stop because of the action associated with the error.
# But, just in case, this state asks the state machine to exit.
errorDeath:
default n errorDeath doExit