| // Copyright 2016 the V8 project authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "src/regexp/regexp-parser.h" |
| |
| #include <vector> |
| |
| #include "src/char-predicates-inl.h" |
| #include "src/factory.h" |
| #include "src/isolate.h" |
| #include "src/objects-inl.h" |
| #include "src/ostreams.h" |
| #include "src/regexp/jsregexp.h" |
| #include "src/utils.h" |
| |
| #ifdef V8_INTL_SUPPORT |
| #include "unicode/uniset.h" |
| // TODO(mathias): Remove this when we no longer need to check |
| // `U_ICU_VERSION_MAJOR_NUM`. |
| #include "unicode/uvernum.h" |
| #endif // V8_INTL_SUPPORT |
| |
| namespace v8 { |
| namespace internal { |
| |
| RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, |
| JSRegExp::Flags flags, Isolate* isolate, Zone* zone) |
| : isolate_(isolate), |
| zone_(zone), |
| error_(error), |
| captures_(nullptr), |
| named_captures_(nullptr), |
| named_back_references_(nullptr), |
| in_(in), |
| current_(kEndMarker), |
| top_level_flags_(flags), |
| next_pos_(0), |
| captures_started_(0), |
| capture_count_(0), |
| has_more_(true), |
| simple_(false), |
| contains_anchor_(false), |
| is_scanned_for_captures_(false), |
| has_named_captures_(false), |
| failed_(false) { |
| Advance(); |
| } |
| |
| template <bool update_position> |
| inline uc32 RegExpParser::ReadNext() { |
| int position = next_pos_; |
| uc32 c0 = in()->Get(position); |
| position++; |
| // Read the whole surrogate pair in case of unicode flag, if possible. |
| if (unicode() && position < in()->length() && |
| unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { |
| uc16 c1 = in()->Get(position); |
| if (unibrow::Utf16::IsTrailSurrogate(c1)) { |
| c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); |
| position++; |
| } |
| } |
| if (update_position) next_pos_ = position; |
| return c0; |
| } |
| |
| |
| uc32 RegExpParser::Next() { |
| if (has_next()) { |
| return ReadNext<false>(); |
| } else { |
| return kEndMarker; |
| } |
| } |
| |
| void RegExpParser::Advance() { |
| if (has_next()) { |
| StackLimitCheck check(isolate()); |
| if (check.HasOverflowed()) { |
| if (FLAG_abort_on_stack_or_string_length_overflow) { |
| FATAL("Aborting on stack overflow"); |
| } |
| ReportError(CStrVector( |
| MessageTemplate::TemplateString(MessageTemplate::kStackOverflow))); |
| } else if (zone()->excess_allocation()) { |
| ReportError(CStrVector("Regular expression too large")); |
| } else { |
| current_ = ReadNext<true>(); |
| } |
| } else { |
| current_ = kEndMarker; |
| // Advance so that position() points to 1-after-the-last-character. This is |
| // important so that Reset() to this position works correctly. |
| next_pos_ = in()->length() + 1; |
| has_more_ = false; |
| } |
| } |
| |
| |
| void RegExpParser::Reset(int pos) { |
| next_pos_ = pos; |
| has_more_ = (pos < in()->length()); |
| Advance(); |
| } |
| |
| void RegExpParser::Advance(int dist) { |
| next_pos_ += dist - 1; |
| Advance(); |
| } |
| |
| |
| bool RegExpParser::simple() { return simple_; } |
| |
| bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { |
| switch (c) { |
| case '^': |
| case '$': |
| case '\\': |
| case '.': |
| case '*': |
| case '+': |
| case '?': |
| case '(': |
| case ')': |
| case '[': |
| case ']': |
| case '{': |
| case '}': |
| case '|': |
| case '/': |
| return true; |
| default: |
| break; |
| } |
| return false; |
| } |
| |
| |
| RegExpTree* RegExpParser::ReportError(Vector<const char> message) { |
| if (failed_) return nullptr; // Do not overwrite any existing error. |
| failed_ = true; |
| *error_ = isolate() |
| ->factory() |
| ->NewStringFromOneByte(Vector<const uint8_t>::cast(message)) |
| .ToHandleChecked(); |
| // Zip to the end to make sure the no more input is read. |
| current_ = kEndMarker; |
| next_pos_ = in()->length(); |
| return nullptr; |
| } |
| |
| #define CHECK_FAILED /**/); \ |
| if (failed_) return nullptr; \ |
| ((void)0 |
| |
| // Pattern :: |
| // Disjunction |
| RegExpTree* RegExpParser::ParsePattern() { |
| RegExpTree* result = ParseDisjunction(CHECK_FAILED); |
| PatchNamedBackReferences(CHECK_FAILED); |
| DCHECK(!has_more()); |
| // If the result of parsing is a literal string atom, and it has the |
| // same length as the input, then the atom is identical to the input. |
| if (result->IsAtom() && result->AsAtom()->length() == in()->length()) { |
| simple_ = true; |
| } |
| return result; |
| } |
| |
| |
| // Disjunction :: |
| // Alternative |
| // Alternative | Disjunction |
| // Alternative :: |
| // [empty] |
| // Term Alternative |
| // Term :: |
| // Assertion |
| // Atom |
| // Atom Quantifier |
| RegExpTree* RegExpParser::ParseDisjunction() { |
| // Used to store current state while parsing subexpressions. |
| RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, |
| 0, nullptr, top_level_flags_, zone()); |
| RegExpParserState* state = &initial_state; |
| // Cache the builder in a local variable for quick access. |
| RegExpBuilder* builder = initial_state.builder(); |
| while (true) { |
| switch (current()) { |
| case kEndMarker: |
| if (state->IsSubexpression()) { |
| // Inside a parenthesized group when hitting end of input. |
| return ReportError(CStrVector("Unterminated group")); |
| } |
| DCHECK_EQ(INITIAL, state->group_type()); |
| // Parsing completed successfully. |
| return builder->ToRegExp(); |
| case ')': { |
| if (!state->IsSubexpression()) { |
| return ReportError(CStrVector("Unmatched ')'")); |
| } |
| DCHECK_NE(INITIAL, state->group_type()); |
| |
| Advance(); |
| // End disjunction parsing and convert builder content to new single |
| // regexp atom. |
| RegExpTree* body = builder->ToRegExp(); |
| |
| int end_capture_index = captures_started(); |
| |
| int capture_index = state->capture_index(); |
| SubexpressionType group_type = state->group_type(); |
| |
| // Build result of subexpression. |
| if (group_type == CAPTURE) { |
| if (state->IsNamedCapture()) { |
| CreateNamedCaptureAtIndex(state->capture_name(), |
| capture_index CHECK_FAILED); |
| } |
| RegExpCapture* capture = GetCapture(capture_index); |
| capture->set_body(body); |
| body = capture; |
| } else if (group_type == GROUPING) { |
| body = new (zone()) RegExpGroup(body); |
| } else { |
| DCHECK(group_type == POSITIVE_LOOKAROUND || |
| group_type == NEGATIVE_LOOKAROUND); |
| bool is_positive = (group_type == POSITIVE_LOOKAROUND); |
| body = new (zone()) RegExpLookaround( |
| body, is_positive, end_capture_index - capture_index, |
| capture_index, state->lookaround_type()); |
| } |
| |
| // Restore previous state. |
| state = state->previous_state(); |
| builder = state->builder(); |
| |
| builder->AddAtom(body); |
| // For compatibility with JSC and ES3, we allow quantifiers after |
| // lookaheads, and break in all cases. |
| break; |
| } |
| case '|': { |
| Advance(); |
| builder->NewAlternative(); |
| continue; |
| } |
| case '*': |
| case '+': |
| case '?': |
| return ReportError(CStrVector("Nothing to repeat")); |
| case '^': { |
| Advance(); |
| if (builder->multiline()) { |
| builder->AddAssertion(new (zone()) RegExpAssertion( |
| RegExpAssertion::START_OF_LINE, builder->flags())); |
| } else { |
| builder->AddAssertion(new (zone()) RegExpAssertion( |
| RegExpAssertion::START_OF_INPUT, builder->flags())); |
| set_contains_anchor(); |
| } |
| continue; |
| } |
| case '$': { |
| Advance(); |
| RegExpAssertion::AssertionType assertion_type = |
| builder->multiline() ? RegExpAssertion::END_OF_LINE |
| : RegExpAssertion::END_OF_INPUT; |
| builder->AddAssertion( |
| new (zone()) RegExpAssertion(assertion_type, builder->flags())); |
| continue; |
| } |
| case '.': { |
| Advance(); |
| ZoneList<CharacterRange>* ranges = |
| new (zone()) ZoneList<CharacterRange>(2, zone()); |
| |
| if (builder->dotall()) { |
| // Everything. |
| CharacterRange::AddClassEscape('*', ranges, false, zone()); |
| } else { |
| // Everything except \x0A, \x0D, \u2028 and \u2029 |
| CharacterRange::AddClassEscape('.', ranges, false, zone()); |
| } |
| |
| RegExpCharacterClass* cc = |
| new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags()); |
| builder->AddCharacterClass(cc); |
| break; |
| } |
| case '(': { |
| state = ParseOpenParenthesis(state CHECK_FAILED); |
| builder = state->builder(); |
| continue; |
| } |
| case '[': { |
| RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED); |
| builder->AddCharacterClass(cc->AsCharacterClass()); |
| break; |
| } |
| // Atom :: |
| // \ AtomEscape |
| case '\\': |
| switch (Next()) { |
| case kEndMarker: |
| return ReportError(CStrVector("\\ at end of pattern")); |
| case 'b': |
| Advance(2); |
| builder->AddAssertion(new (zone()) RegExpAssertion( |
| RegExpAssertion::BOUNDARY, builder->flags())); |
| continue; |
| case 'B': |
| Advance(2); |
| builder->AddAssertion(new (zone()) RegExpAssertion( |
| RegExpAssertion::NON_BOUNDARY, builder->flags())); |
| continue; |
| // AtomEscape :: |
| // CharacterClassEscape |
| // |
| // CharacterClassEscape :: one of |
| // d D s S w W |
| case 'd': |
| case 'D': |
| case 's': |
| case 'S': |
| case 'w': |
| case 'W': { |
| uc32 c = Next(); |
| Advance(2); |
| ZoneList<CharacterRange>* ranges = |
| new (zone()) ZoneList<CharacterRange>(2, zone()); |
| CharacterRange::AddClassEscape( |
| c, ranges, unicode() && builder->ignore_case(), zone()); |
| RegExpCharacterClass* cc = new (zone()) |
| RegExpCharacterClass(zone(), ranges, builder->flags()); |
| builder->AddCharacterClass(cc); |
| break; |
| } |
| case 'p': |
| case 'P': { |
| uc32 p = Next(); |
| Advance(2); |
| if (unicode()) { |
| if (FLAG_harmony_regexp_property) { |
| ZoneList<CharacterRange>* ranges = |
| new (zone()) ZoneList<CharacterRange>(2, zone()); |
| if (!ParsePropertyClass(ranges, p == 'P')) { |
| return ReportError(CStrVector("Invalid property name")); |
| } |
| RegExpCharacterClass* cc = new (zone()) |
| RegExpCharacterClass(zone(), ranges, builder->flags()); |
| builder->AddCharacterClass(cc); |
| } else { |
| // With /u, no identity escapes except for syntax characters |
| // are allowed. Otherwise, all identity escapes are allowed. |
| return ReportError(CStrVector("Invalid escape")); |
| } |
| } else { |
| builder->AddCharacter(p); |
| } |
| break; |
| } |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': { |
| int index = 0; |
| bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED); |
| if (is_backref) { |
| if (state->IsInsideCaptureGroup(index)) { |
| // The back reference is inside the capture group it refers to. |
| // Nothing can possibly have been captured yet, so we use empty |
| // instead. This ensures that, when checking a back reference, |
| // the capture registers of the referenced capture are either |
| // both set or both cleared. |
| builder->AddEmpty(); |
| } else { |
| RegExpCapture* capture = GetCapture(index); |
| RegExpTree* atom = |
| new (zone()) RegExpBackReference(capture, builder->flags()); |
| builder->AddAtom(atom); |
| } |
| break; |
| } |
| // With /u, no identity escapes except for syntax characters |
| // are allowed. Otherwise, all identity escapes are allowed. |
| if (unicode()) { |
| return ReportError(CStrVector("Invalid escape")); |
| } |
| uc32 first_digit = Next(); |
| if (first_digit == '8' || first_digit == '9') { |
| builder->AddCharacter(first_digit); |
| Advance(2); |
| break; |
| } |
| } |
| // Fall through. |
| case '0': { |
| Advance(); |
| if (unicode() && Next() >= '0' && Next() <= '9') { |
| // With /u, decimal escape with leading 0 are not parsed as octal. |
| return ReportError(CStrVector("Invalid decimal escape")); |
| } |
| uc32 octal = ParseOctalLiteral(); |
| builder->AddCharacter(octal); |
| break; |
| } |
| // ControlEscape :: one of |
| // f n r t v |
| case 'f': |
| Advance(2); |
| builder->AddCharacter('\f'); |
| break; |
| case 'n': |
| Advance(2); |
| builder->AddCharacter('\n'); |
| break; |
| case 'r': |
| Advance(2); |
| builder->AddCharacter('\r'); |
| break; |
| case 't': |
| Advance(2); |
| builder->AddCharacter('\t'); |
| break; |
| case 'v': |
| Advance(2); |
| builder->AddCharacter('\v'); |
| break; |
| case 'c': { |
| Advance(); |
| uc32 controlLetter = Next(); |
| // Special case if it is an ASCII letter. |
| // Convert lower case letters to uppercase. |
| uc32 letter = controlLetter & ~('a' ^ 'A'); |
| if (letter < 'A' || 'Z' < letter) { |
| // controlLetter is not in range 'A'-'Z' or 'a'-'z'. |
| // Read the backslash as a literal character instead of as |
| // starting an escape. |
| // ES#prod-annexB-ExtendedPatternCharacter |
| if (unicode()) { |
| // With /u, invalid escapes are not treated as identity escapes. |
| return ReportError(CStrVector("Invalid unicode escape")); |
| } |
| builder->AddCharacter('\\'); |
| } else { |
| Advance(2); |
| builder->AddCharacter(controlLetter & 0x1F); |
| } |
| break; |
| } |
| case 'x': { |
| Advance(2); |
| uc32 value; |
| if (ParseHexEscape(2, &value)) { |
| builder->AddCharacter(value); |
| } else if (!unicode()) { |
| builder->AddCharacter('x'); |
| } else { |
| // With /u, invalid escapes are not treated as identity escapes. |
| return ReportError(CStrVector("Invalid escape")); |
| } |
| break; |
| } |
| case 'u': { |
| Advance(2); |
| uc32 value; |
| if (ParseUnicodeEscape(&value)) { |
| builder->AddEscapedUnicodeCharacter(value); |
| } else if (!unicode()) { |
| builder->AddCharacter('u'); |
| } else { |
| // With /u, invalid escapes are not treated as identity escapes. |
| return ReportError(CStrVector("Invalid unicode escape")); |
| } |
| break; |
| } |
| case 'k': |
| // Either an identity escape or a named back-reference. The two |
| // interpretations are mutually exclusive: '\k' is interpreted as |
| // an identity escape for non-unicode patterns without named |
| // capture groups, and as the beginning of a named back-reference |
| // in all other cases. |
| if (FLAG_harmony_regexp_named_captures && |
| (unicode() || HasNamedCaptures())) { |
| Advance(2); |
| ParseNamedBackReference(builder, state CHECK_FAILED); |
| break; |
| } |
| // Fall through. |
| default: |
| Advance(); |
| // With /u, no identity escapes except for syntax characters |
| // are allowed. Otherwise, all identity escapes are allowed. |
| if (!unicode() || IsSyntaxCharacterOrSlash(current())) { |
| builder->AddCharacter(current()); |
| Advance(); |
| } else { |
| return ReportError(CStrVector("Invalid escape")); |
| } |
| break; |
| } |
| break; |
| case '{': { |
| int dummy; |
| bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED); |
| if (parsed) return ReportError(CStrVector("Nothing to repeat")); |
| // Fall through. |
| } |
| case '}': |
| case ']': |
| if (unicode()) { |
| return ReportError(CStrVector("Lone quantifier brackets")); |
| } |
| // Fall through. |
| default: |
| builder->AddUnicodeCharacter(current()); |
| Advance(); |
| break; |
| } // end switch(current()) |
| |
| int min; |
| int max; |
| switch (current()) { |
| // QuantifierPrefix :: |
| // * |
| // + |
| // ? |
| // { |
| case '*': |
| min = 0; |
| max = RegExpTree::kInfinity; |
| Advance(); |
| break; |
| case '+': |
| min = 1; |
| max = RegExpTree::kInfinity; |
| Advance(); |
| break; |
| case '?': |
| min = 0; |
| max = 1; |
| Advance(); |
| break; |
| case '{': |
| if (ParseIntervalQuantifier(&min, &max)) { |
| if (max < min) { |
| return ReportError( |
| CStrVector("numbers out of order in {} quantifier")); |
| } |
| break; |
| } else if (unicode()) { |
| // With /u, incomplete quantifiers are not allowed. |
| return ReportError(CStrVector("Incomplete quantifier")); |
| } |
| continue; |
| default: |
| continue; |
| } |
| RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; |
| if (current() == '?') { |
| quantifier_type = RegExpQuantifier::NON_GREEDY; |
| Advance(); |
| } else if (FLAG_regexp_possessive_quantifier && current() == '+') { |
| // FLAG_regexp_possessive_quantifier is a debug-only flag. |
| quantifier_type = RegExpQuantifier::POSSESSIVE; |
| Advance(); |
| } |
| if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { |
| return ReportError(CStrVector("Invalid quantifier")); |
| } |
| } |
| } |
| |
| RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( |
| RegExpParserState* state) { |
| RegExpLookaround::Type lookaround_type = state->lookaround_type(); |
| bool is_named_capture = false; |
| JSRegExp::Flags switch_on = JSRegExp::kNone; |
| JSRegExp::Flags switch_off = JSRegExp::kNone; |
| const ZoneVector<uc16>* capture_name = nullptr; |
| SubexpressionType subexpr_type = CAPTURE; |
| Advance(); |
| if (current() == '?') { |
| switch (Next()) { |
| case ':': |
| Advance(2); |
| subexpr_type = GROUPING; |
| break; |
| case '=': |
| Advance(2); |
| lookaround_type = RegExpLookaround::LOOKAHEAD; |
| subexpr_type = POSITIVE_LOOKAROUND; |
| break; |
| case '!': |
| Advance(2); |
| lookaround_type = RegExpLookaround::LOOKAHEAD; |
| subexpr_type = NEGATIVE_LOOKAROUND; |
| break; |
| case '-': |
| case 'i': |
| case 's': |
| case 'm': { |
| if (!FLAG_regexp_mode_modifiers) { |
| ReportError(CStrVector("Invalid group")); |
| return nullptr; |
| } |
| Advance(); |
| bool flags_sense = true; // Switching on flags. |
| while (subexpr_type != GROUPING) { |
| switch (current()) { |
| case '-': |
| if (!flags_sense) { |
| ReportError(CStrVector("Multiple dashes in flag group")); |
| return nullptr; |
| } |
| flags_sense = false; |
| Advance(); |
| continue; |
| case 's': |
| case 'i': |
| case 'm': { |
| JSRegExp::Flags bit = JSRegExp::kUnicode; |
| if (current() == 'i') bit = JSRegExp::kIgnoreCase; |
| if (current() == 'm') bit = JSRegExp::kMultiline; |
| if (current() == 's') bit = JSRegExp::kDotAll; |
| if (((switch_on | switch_off) & bit) != 0) { |
| ReportError(CStrVector("Repeated flag in flag group")); |
| return nullptr; |
| } |
| if (flags_sense) { |
| switch_on |= bit; |
| } else { |
| switch_off |= bit; |
| } |
| Advance(); |
| continue; |
| } |
| case ')': { |
| Advance(); |
| state->builder() |
| ->FlushText(); // Flush pending text using old flags. |
| // These (?i)-style flag switches don't put us in a subexpression |
| // at all, they just modify the flags in the rest of the current |
| // subexpression. |
| JSRegExp::Flags flags = |
| (state->builder()->flags() | switch_on) & ~switch_off; |
| state->builder()->set_flags(flags); |
| return state; |
| } |
| case ':': |
| Advance(); |
| subexpr_type = GROUPING; // Will break us out of the outer loop. |
| continue; |
| default: |
| ReportError(CStrVector("Invalid flag group")); |
| return nullptr; |
| } |
| } |
| break; |
| } |
| case '<': |
| Advance(); |
| if (Next() == '=') { |
| Advance(2); |
| lookaround_type = RegExpLookaround::LOOKBEHIND; |
| subexpr_type = POSITIVE_LOOKAROUND; |
| break; |
| } else if (Next() == '!') { |
| Advance(2); |
| lookaround_type = RegExpLookaround::LOOKBEHIND; |
| subexpr_type = NEGATIVE_LOOKAROUND; |
| break; |
| } |
| if (FLAG_harmony_regexp_named_captures) { |
| is_named_capture = true; |
| has_named_captures_ = true; |
| Advance(); |
| break; |
| } |
| // Fall through. |
| default: |
| ReportError(CStrVector("Invalid group")); |
| return nullptr; |
| } |
| } |
| if (subexpr_type == CAPTURE) { |
| if (captures_started_ >= kMaxCaptures) { |
| ReportError(CStrVector("Too many captures")); |
| return nullptr; |
| } |
| captures_started_++; |
| |
| if (is_named_capture) { |
| capture_name = ParseCaptureGroupName(CHECK_FAILED); |
| } |
| } |
| JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off; |
| // Store current state and begin new disjunction parsing. |
| return new (zone()) |
| RegExpParserState(state, subexpr_type, lookaround_type, captures_started_, |
| capture_name, flags, zone()); |
| } |
| |
| #ifdef DEBUG |
| // Currently only used in an DCHECK. |
| static bool IsSpecialClassEscape(uc32 c) { |
| switch (c) { |
| case 'd': |
| case 'D': |
| case 's': |
| case 'S': |
| case 'w': |
| case 'W': |
| return true; |
| default: |
| return false; |
| } |
| } |
| #endif |
| |
| |
| // In order to know whether an escape is a backreference or not we have to scan |
| // the entire regexp and find the number of capturing parentheses. However we |
| // don't want to scan the regexp twice unless it is necessary. This mini-parser |
| // is called when needed. It can see the difference between capturing and |
| // noncapturing parentheses and can skip character classes and backslash-escaped |
| // characters. |
| void RegExpParser::ScanForCaptures() { |
| DCHECK(!is_scanned_for_captures_); |
| const int saved_position = position(); |
| // Start with captures started previous to current position |
| int capture_count = captures_started(); |
| // Add count of captures after this position. |
| int n; |
| while ((n = current()) != kEndMarker) { |
| Advance(); |
| switch (n) { |
| case '\\': |
| Advance(); |
| break; |
| case '[': { |
| int c; |
| while ((c = current()) != kEndMarker) { |
| Advance(); |
| if (c == '\\') { |
| Advance(); |
| } else { |
| if (c == ']') break; |
| } |
| } |
| break; |
| } |
| case '(': |
| if (current() == '?') { |
| // At this point we could be in |
| // * a non-capturing group '(:', |
| // * a lookbehind assertion '(?<=' '(?<!' |
| // * or a named capture '(?<'. |
| // |
| // Of these, only named captures are capturing groups. |
| if (!FLAG_harmony_regexp_named_captures) break; |
| |
| Advance(); |
| if (current() != '<') break; |
| |
| Advance(); |
| if (current() == '=' || current() == '!') break; |
| |
| // Found a possible named capture. It could turn out to be a syntax |
| // error (e.g. an unterminated or invalid name), but that distinction |
| // does not matter for our purposes. |
| has_named_captures_ = true; |
| } |
| capture_count++; |
| break; |
| } |
| } |
| capture_count_ = capture_count; |
| is_scanned_for_captures_ = true; |
| Reset(saved_position); |
| } |
| |
| |
| bool RegExpParser::ParseBackReferenceIndex(int* index_out) { |
| DCHECK_EQ('\\', current()); |
| DCHECK('1' <= Next() && Next() <= '9'); |
| // Try to parse a decimal literal that is no greater than the total number |
| // of left capturing parentheses in the input. |
| int start = position(); |
| int value = Next() - '0'; |
| Advance(2); |
| while (true) { |
| uc32 c = current(); |
| if (IsDecimalDigit(c)) { |
| value = 10 * value + (c - '0'); |
| if (value > kMaxCaptures) { |
| Reset(start); |
| return false; |
| } |
| Advance(); |
| } else { |
| break; |
| } |
| } |
| if (value > captures_started()) { |
| if (!is_scanned_for_captures_) ScanForCaptures(); |
| if (value > capture_count_) { |
| Reset(start); |
| return false; |
| } |
| } |
| *index_out = value; |
| return true; |
| } |
| |
| static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) { |
| if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { |
| v->push_back(code_unit); |
| } else { |
| v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); |
| v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); |
| } |
| } |
| |
| const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { |
| DCHECK(FLAG_harmony_regexp_named_captures); |
| |
| ZoneVector<uc16>* name = |
| new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); |
| |
| bool at_start = true; |
| while (true) { |
| uc32 c = current(); |
| Advance(); |
| |
| // Convert unicode escapes. |
| if (c == '\\' && current() == 'u') { |
| Advance(); |
| if (!ParseUnicodeEscape(&c)) { |
| ReportError(CStrVector("Invalid Unicode escape sequence")); |
| return nullptr; |
| } |
| } |
| |
| // The backslash char is misclassified as both ID_Start and ID_Continue. |
| if (c == '\\') { |
| ReportError(CStrVector("Invalid capture group name")); |
| return nullptr; |
| } |
| |
| if (at_start) { |
| if (!IdentifierStart::Is(c)) { |
| ReportError(CStrVector("Invalid capture group name")); |
| return nullptr; |
| } |
| push_code_unit(name, c); |
| at_start = false; |
| } else { |
| if (c == '>') { |
| break; |
| } else if (IdentifierPart::Is(c)) { |
| push_code_unit(name, c); |
| } else { |
| ReportError(CStrVector("Invalid capture group name")); |
| return nullptr; |
| } |
| } |
| } |
| |
| return name; |
| } |
| |
| bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, |
| int index) { |
| DCHECK(FLAG_harmony_regexp_named_captures); |
| DCHECK(0 < index && index <= captures_started_); |
| DCHECK_NOT_NULL(name); |
| |
| if (named_captures_ == nullptr) { |
| named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone()); |
| } else { |
| // Check for duplicates and bail if we find any. |
| // TODO(jgruber): O(n^2). |
| for (const auto& named_capture : *named_captures_) { |
| if (*named_capture->name() == *name) { |
| ReportError(CStrVector("Duplicate capture group name")); |
| return false; |
| } |
| } |
| } |
| |
| RegExpCapture* capture = GetCapture(index); |
| DCHECK_NULL(capture->name()); |
| |
| capture->set_name(name); |
| named_captures_->Add(capture, zone()); |
| |
| return true; |
| } |
| |
| bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, |
| RegExpParserState* state) { |
| // The parser is assumed to be on the '<' in \k<name>. |
| if (current() != '<') { |
| ReportError(CStrVector("Invalid named reference")); |
| return false; |
| } |
| |
| Advance(); |
| const ZoneVector<uc16>* name = ParseCaptureGroupName(); |
| if (name == nullptr) { |
| return false; |
| } |
| |
| if (state->IsInsideCaptureGroup(name)) { |
| builder->AddEmpty(); |
| } else { |
| RegExpBackReference* atom = |
| new (zone()) RegExpBackReference(builder->flags()); |
| atom->set_name(name); |
| |
| builder->AddAtom(atom); |
| |
| if (named_back_references_ == nullptr) { |
| named_back_references_ = |
| new (zone()) ZoneList<RegExpBackReference*>(1, zone()); |
| } |
| named_back_references_->Add(atom, zone()); |
| } |
| |
| return true; |
| } |
| |
| void RegExpParser::PatchNamedBackReferences() { |
| if (named_back_references_ == nullptr) return; |
| |
| if (named_captures_ == nullptr) { |
| ReportError(CStrVector("Invalid named capture referenced")); |
| return; |
| } |
| |
| // Look up and patch the actual capture for each named back reference. |
| // TODO(jgruber): O(n^2), optimize if necessary. |
| |
| for (int i = 0; i < named_back_references_->length(); i++) { |
| RegExpBackReference* ref = named_back_references_->at(i); |
| |
| int index = -1; |
| for (const auto& capture : *named_captures_) { |
| if (*capture->name() == *ref->name()) { |
| index = capture->index(); |
| break; |
| } |
| } |
| |
| if (index == -1) { |
| ReportError(CStrVector("Invalid named capture referenced")); |
| return; |
| } |
| |
| ref->set_capture(GetCapture(index)); |
| } |
| } |
| |
| RegExpCapture* RegExpParser::GetCapture(int index) { |
| // The index for the capture groups are one-based. Its index in the list is |
| // zero-based. |
| int know_captures = |
| is_scanned_for_captures_ ? capture_count_ : captures_started_; |
| DCHECK(index <= know_captures); |
| if (captures_ == nullptr) { |
| captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone()); |
| } |
| while (captures_->length() < know_captures) { |
| captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone()); |
| } |
| return captures_->at(index - 1); |
| } |
| |
| Handle<FixedArray> RegExpParser::CreateCaptureNameMap() { |
| if (named_captures_ == nullptr || named_captures_->is_empty()) |
| return Handle<FixedArray>(); |
| |
| Factory* factory = isolate()->factory(); |
| |
| int len = named_captures_->length() * 2; |
| Handle<FixedArray> array = factory->NewFixedArray(len); |
| |
| for (int i = 0; i < named_captures_->length(); i++) { |
| RegExpCapture* capture = named_captures_->at(i); |
| MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name()); |
| array->set(i * 2, *name.ToHandleChecked()); |
| array->set(i * 2 + 1, Smi::FromInt(capture->index())); |
| } |
| |
| return array; |
| } |
| |
| bool RegExpParser::HasNamedCaptures() { |
| if (has_named_captures_ || is_scanned_for_captures_) { |
| return has_named_captures_; |
| } |
| |
| ScanForCaptures(); |
| DCHECK(is_scanned_for_captures_); |
| return has_named_captures_; |
| } |
| |
| bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) { |
| for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) { |
| if (s->group_type() != CAPTURE) continue; |
| // Return true if we found the matching capture index. |
| if (index == s->capture_index()) return true; |
| // Abort if index is larger than what has been parsed up till this state. |
| if (index > s->capture_index()) return false; |
| } |
| return false; |
| } |
| |
| bool RegExpParser::RegExpParserState::IsInsideCaptureGroup( |
| const ZoneVector<uc16>* name) { |
| DCHECK_NOT_NULL(name); |
| for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) { |
| if (s->capture_name() == nullptr) continue; |
| if (*s->capture_name() == *name) return true; |
| } |
| return false; |
| } |
| |
| // QuantifierPrefix :: |
| // { DecimalDigits } |
| // { DecimalDigits , } |
| // { DecimalDigits , DecimalDigits } |
| // |
| // Returns true if parsing succeeds, and set the min_out and max_out |
| // values. Values are truncated to RegExpTree::kInfinity if they overflow. |
| bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) { |
| DCHECK_EQ(current(), '{'); |
| int start = position(); |
| Advance(); |
| int min = 0; |
| if (!IsDecimalDigit(current())) { |
| Reset(start); |
| return false; |
| } |
| while (IsDecimalDigit(current())) { |
| int next = current() - '0'; |
| if (min > (RegExpTree::kInfinity - next) / 10) { |
| // Overflow. Skip past remaining decimal digits and return -1. |
| do { |
| Advance(); |
| } while (IsDecimalDigit(current())); |
| min = RegExpTree::kInfinity; |
| break; |
| } |
| min = 10 * min + next; |
| Advance(); |
| } |
| int max = 0; |
| if (current() == '}') { |
| max = min; |
| Advance(); |
| } else if (current() == ',') { |
| Advance(); |
| if (current() == '}') { |
| max = RegExpTree::kInfinity; |
| Advance(); |
| } else { |
| while (IsDecimalDigit(current())) { |
| int next = current() - '0'; |
| if (max > (RegExpTree::kInfinity - next) / 10) { |
| do { |
| Advance(); |
| } while (IsDecimalDigit(current())); |
| max = RegExpTree::kInfinity; |
| break; |
| } |
| max = 10 * max + next; |
| Advance(); |
| } |
| if (current() != '}') { |
| Reset(start); |
| return false; |
| } |
| Advance(); |
| } |
| } else { |
| Reset(start); |
| return false; |
| } |
| *min_out = min; |
| *max_out = max; |
| return true; |
| } |
| |
| |
| uc32 RegExpParser::ParseOctalLiteral() { |
| DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker); |
| // For compatibility with some other browsers (not all), we parse |
| // up to three octal digits with a value below 256. |
| // ES#prod-annexB-LegacyOctalEscapeSequence |
| uc32 value = current() - '0'; |
| Advance(); |
| if ('0' <= current() && current() <= '7') { |
| value = value * 8 + current() - '0'; |
| Advance(); |
| if (value < 32 && '0' <= current() && current() <= '7') { |
| value = value * 8 + current() - '0'; |
| Advance(); |
| } |
| } |
| return value; |
| } |
| |
| |
| bool RegExpParser::ParseHexEscape(int length, uc32* value) { |
| int start = position(); |
| uc32 val = 0; |
| for (int i = 0; i < length; ++i) { |
| uc32 c = current(); |
| int d = HexValue(c); |
| if (d < 0) { |
| Reset(start); |
| return false; |
| } |
| val = val * 16 + d; |
| Advance(); |
| } |
| *value = val; |
| return true; |
| } |
| |
| // This parses RegExpUnicodeEscapeSequence as described in ECMA262. |
| bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
| // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
| // allowed). In the latter case, the number of hex digits between { } is |
| // arbitrary. \ and u have already been read. |
| if (current() == '{' && unicode()) { |
| int start = position(); |
| Advance(); |
| if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) { |
| if (current() == '}') { |
| Advance(); |
| return true; |
| } |
| } |
| Reset(start); |
| return false; |
| } |
| // \u but no {, or \u{...} escapes not allowed. |
| bool result = ParseHexEscape(4, value); |
| if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) && |
| current() == '\\') { |
| // Attempt to read trail surrogate. |
| int start = position(); |
| if (Next() == 'u') { |
| Advance(2); |
| uc32 trail; |
| if (ParseHexEscape(4, &trail) && |
| unibrow::Utf16::IsTrailSurrogate(trail)) { |
| *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value), |
| static_cast<uc16>(trail)); |
| return true; |
| } |
| } |
| Reset(start); |
| } |
| return result; |
| } |
| |
| #ifdef V8_INTL_SUPPORT |
| |
| namespace { |
| |
| bool IsExactPropertyAlias(const char* property_name, UProperty property) { |
| const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME); |
| if (short_name != nullptr && strcmp(property_name, short_name) == 0) |
| return true; |
| for (int i = 0;; i++) { |
| const char* long_name = u_getPropertyName( |
| property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); |
| if (long_name == nullptr) break; |
| if (strcmp(property_name, long_name) == 0) return true; |
| } |
| return false; |
| } |
| |
| bool IsExactPropertyValueAlias(const char* property_value_name, |
| UProperty property, int32_t property_value) { |
| const char* short_name = |
| u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME); |
| if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) { |
| return true; |
| } |
| for (int i = 0;; i++) { |
| const char* long_name = u_getPropertyValueName( |
| property, property_value, |
| static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); |
| if (long_name == nullptr) break; |
| if (strcmp(property_value_name, long_name) == 0) return true; |
| } |
| return false; |
| } |
| |
| bool LookupPropertyValueName(UProperty property, |
| const char* property_value_name, bool negate, |
| ZoneList<CharacterRange>* result, Zone* zone) { |
| UProperty property_for_lookup = property; |
| if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) { |
| // For the property Script_Extensions, we have to do the property value |
| // name lookup as if the property is Script. |
| property_for_lookup = UCHAR_SCRIPT; |
| } |
| int32_t property_value = |
| u_getPropertyValueEnum(property_for_lookup, property_value_name); |
| if (property_value == UCHAR_INVALID_CODE) return false; |
| |
| // We require the property name to match exactly to one of the property value |
| // aliases. However, u_getPropertyValueEnum uses loose matching. |
| if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup, |
| property_value)) { |
| return false; |
| } |
| |
| UErrorCode ec = U_ZERO_ERROR; |
| icu::UnicodeSet set; |
| set.applyIntPropertyValue(property, property_value, ec); |
| bool success = ec == U_ZERO_ERROR && !set.isEmpty(); |
| |
| if (success) { |
| set.removeAllStrings(); |
| if (negate) set.complement(); |
| for (int i = 0; i < set.getRangeCount(); i++) { |
| result->Add( |
| CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), |
| zone); |
| } |
| } |
| return success; |
| } |
| |
| template <size_t N> |
| inline bool NameEquals(const char* name, const char (&literal)[N]) { |
| return strncmp(name, literal, N + 1) == 0; |
| } |
| |
| bool LookupSpecialPropertyValueName(const char* name, |
| ZoneList<CharacterRange>* result, |
| bool negate, Zone* zone) { |
| if (NameEquals(name, "Any")) { |
| if (negate) { |
| // Leave the list of character ranges empty, since the negation of 'Any' |
| // is the empty set. |
| } else { |
| result->Add(CharacterRange::Everything(), zone); |
| } |
| } else if (NameEquals(name, "ASCII")) { |
| result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint) |
| : CharacterRange::Range(0x0, 0x7F), |
| zone); |
| } else if (NameEquals(name, "Assigned")) { |
| return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned", |
| !negate, result, zone); |
| } else { |
| return false; |
| } |
| return true; |
| } |
| |
| // Explicitly whitelist supported binary properties. The spec forbids supporting |
| // properties outside of this set to ensure interoperability. |
| bool IsSupportedBinaryProperty(UProperty property) { |
| switch (property) { |
| case UCHAR_ALPHABETIC: |
| // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName. |
| // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName. |
| case UCHAR_ASCII_HEX_DIGIT: |
| // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName. |
| case UCHAR_BIDI_CONTROL: |
| case UCHAR_BIDI_MIRRORED: |
| case UCHAR_CASE_IGNORABLE: |
| case UCHAR_CASED: |
| case UCHAR_CHANGES_WHEN_CASEFOLDED: |
| case UCHAR_CHANGES_WHEN_CASEMAPPED: |
| case UCHAR_CHANGES_WHEN_LOWERCASED: |
| case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED: |
| case UCHAR_CHANGES_WHEN_TITLECASED: |
| case UCHAR_CHANGES_WHEN_UPPERCASED: |
| case UCHAR_DASH: |
| case UCHAR_DEFAULT_IGNORABLE_CODE_POINT: |
| case UCHAR_DEPRECATED: |
| case UCHAR_DIACRITIC: |
| case UCHAR_EMOJI: |
| #if U_ICU_VERSION_MAJOR_NUM >= 60 |
| case UCHAR_EMOJI_COMPONENT: |
| #endif |
| case UCHAR_EMOJI_MODIFIER_BASE: |
| case UCHAR_EMOJI_MODIFIER: |
| case UCHAR_EMOJI_PRESENTATION: |
| case UCHAR_EXTENDER: |
| case UCHAR_GRAPHEME_BASE: |
| case UCHAR_GRAPHEME_EXTEND: |
| case UCHAR_HEX_DIGIT: |
| case UCHAR_ID_CONTINUE: |
| case UCHAR_ID_START: |
| case UCHAR_IDEOGRAPHIC: |
| case UCHAR_IDS_BINARY_OPERATOR: |
| case UCHAR_IDS_TRINARY_OPERATOR: |
| case UCHAR_JOIN_CONTROL: |
| case UCHAR_LOGICAL_ORDER_EXCEPTION: |
| case UCHAR_LOWERCASE: |
| case UCHAR_MATH: |
| case UCHAR_NONCHARACTER_CODE_POINT: |
| case UCHAR_PATTERN_SYNTAX: |
| case UCHAR_PATTERN_WHITE_SPACE: |
| case UCHAR_QUOTATION_MARK: |
| case UCHAR_RADICAL: |
| #if U_ICU_VERSION_MAJOR_NUM >= 60 |
| case UCHAR_REGIONAL_INDICATOR: |
| #endif |
| case UCHAR_S_TERM: |
| case UCHAR_SOFT_DOTTED: |
| case UCHAR_TERMINAL_PUNCTUATION: |
| case UCHAR_UNIFIED_IDEOGRAPH: |
| case UCHAR_UPPERCASE: |
| case UCHAR_VARIATION_SELECTOR: |
| case UCHAR_WHITE_SPACE: |
| case UCHAR_XID_CONTINUE: |
| case UCHAR_XID_START: |
| return true; |
| default: |
| break; |
| } |
| return false; |
| } |
| |
| bool IsUnicodePropertyValueCharacter(char c) { |
| // https://tc39.github.io/proposal-regexp-unicode-property-escapes/ |
| // |
| // Note that using this to validate each parsed char is quite conservative. |
| // A possible alternative solution would be to only ensure the parsed |
| // property name/value candidate string does not contain '\0' characters and |
| // let ICU lookups trigger the final failure. |
| if ('a' <= c && c <= 'z') return true; |
| if ('A' <= c && c <= 'Z') return true; |
| if ('0' <= c && c <= '9') return true; |
| return (c == '_'); |
| } |
| |
| } // anonymous namespace |
| |
| bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result, |
| bool negate) { |
| // Parse the property class as follows: |
| // - In \p{name}, 'name' is interpreted |
| // - either as a general category property value name. |
| // - or as a binary property name. |
| // - In \p{name=value}, 'name' is interpreted as an enumerated property name, |
| // and 'value' is interpreted as one of the available property value names. |
| // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used. |
| // - Loose matching is not applied. |
| std::vector<char> first_part; |
| std::vector<char> second_part; |
| if (current() == '{') { |
| // Parse \p{[PropertyName=]PropertyNameValue} |
| for (Advance(); current() != '}' && current() != '='; Advance()) { |
| if (!IsUnicodePropertyValueCharacter(current())) return false; |
| if (!has_next()) return false; |
| first_part.push_back(static_cast<char>(current())); |
| } |
| if (current() == '=') { |
| for (Advance(); current() != '}'; Advance()) { |
| if (!IsUnicodePropertyValueCharacter(current())) return false; |
| if (!has_next()) return false; |
| second_part.push_back(static_cast<char>(current())); |
| } |
| second_part.push_back(0); // null-terminate string. |
| } |
| } else { |
| return false; |
| } |
| Advance(); |
| first_part.push_back(0); // null-terminate string. |
| |
| DCHECK(first_part.size() - 1 == std::strlen(first_part.data())); |
| DCHECK(second_part.empty() || |
| second_part.size() - 1 == std::strlen(second_part.data())); |
| |
| if (second_part.empty()) { |
| // First attempt to interpret as general category property value name. |
| const char* name = first_part.data(); |
| if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate, |
| result, zone())) { |
| return true; |
| } |
| // Interpret "Any", "ASCII", and "Assigned". |
| if (LookupSpecialPropertyValueName(name, result, negate, zone())) { |
| return true; |
| } |
| // Then attempt to interpret as binary property name with value name 'Y'. |
| UProperty property = u_getPropertyEnum(name); |
| if (!IsSupportedBinaryProperty(property)) return false; |
| if (!IsExactPropertyAlias(name, property)) return false; |
| return LookupPropertyValueName(property, negate ? "N" : "Y", false, result, |
| zone()); |
| } else { |
| // Both property name and value name are specified. Attempt to interpret |
| // the property name as enumerated property. |
| const char* property_name = first_part.data(); |
| const char* value_name = second_part.data(); |
| UProperty property = u_getPropertyEnum(property_name); |
| if (!IsExactPropertyAlias(property_name, property)) return false; |
| if (property == UCHAR_GENERAL_CATEGORY) { |
| // We want to allow aggregate value names such as "Letter". |
| property = UCHAR_GENERAL_CATEGORY_MASK; |
| } else if (property != UCHAR_SCRIPT && |
| property != UCHAR_SCRIPT_EXTENSIONS) { |
| return false; |
| } |
| return LookupPropertyValueName(property, value_name, negate, result, |
| zone()); |
| } |
| } |
| |
| #else // V8_INTL_SUPPORT |
| |
| bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result, |
| bool negate) { |
| return false; |
| } |
| |
| #endif // V8_INTL_SUPPORT |
| |
| bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { |
| uc32 x = 0; |
| int d = HexValue(current()); |
| if (d < 0) { |
| return false; |
| } |
| while (d >= 0) { |
| x = x * 16 + d; |
| if (x > max_value) { |
| return false; |
| } |
| Advance(); |
| d = HexValue(current()); |
| } |
| *value = x; |
| return true; |
| } |
| |
| |
| uc32 RegExpParser::ParseClassCharacterEscape() { |
| DCHECK_EQ('\\', current()); |
| DCHECK(has_next() && !IsSpecialClassEscape(Next())); |
| Advance(); |
| switch (current()) { |
| case 'b': |
| Advance(); |
| return '\b'; |
| // ControlEscape :: one of |
| // f n r t v |
| case 'f': |
| Advance(); |
| return '\f'; |
| case 'n': |
| Advance(); |
| return '\n'; |
| case 'r': |
| Advance(); |
| return '\r'; |
| case 't': |
| Advance(); |
| return '\t'; |
| case 'v': |
| Advance(); |
| return '\v'; |
| case 'c': { |
| uc32 controlLetter = Next(); |
| uc32 letter = controlLetter & ~('A' ^ 'a'); |
| // Inside a character class, we also accept digits and underscore as |
| // control characters, unless with /u. See Annex B: |
| // ES#prod-annexB-ClassControlLetter |
| if (letter >= 'A' && letter <= 'Z') { |
| Advance(2); |
| // Control letters mapped to ASCII control characters in the range |
| // 0x00-0x1F. |
| return controlLetter & 0x1F; |
| } |
| if (unicode()) { |
| // With /u, invalid escapes are not treated as identity escapes. |
| ReportError(CStrVector("Invalid class escape")); |
| return 0; |
| } |
| if ((controlLetter >= '0' && controlLetter <= '9') || |
| controlLetter == '_') { |
| Advance(2); |
| return controlLetter & 0x1F; |
| } |
| // We match JSC in reading the backslash as a literal |
| // character instead of as starting an escape. |
| // TODO(v8:6201): Not yet covered by the spec. |
| return '\\'; |
| } |
| case '0': |
| // With /u, \0 is interpreted as NUL if not followed by another digit. |
| if (unicode() && !(Next() >= '0' && Next() <= '9')) { |
| Advance(); |
| return 0; |
| } |
| // Fall through. |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| // For compatibility, we interpret a decimal escape that isn't |
| // a back reference (and therefore either \0 or not valid according |
| // to the specification) as a 1..3 digit octal character code. |
| // ES#prod-annexB-LegacyOctalEscapeSequence |
| if (unicode()) { |
| // With /u, decimal escape is not interpreted as octal character code. |
| ReportError(CStrVector("Invalid class escape")); |
| return 0; |
| } |
| return ParseOctalLiteral(); |
| case 'x': { |
| Advance(); |
| uc32 value; |
| if (ParseHexEscape(2, &value)) return value; |
| if (unicode()) { |
| // With /u, invalid escapes are not treated as identity escapes. |
| ReportError(CStrVector("Invalid escape")); |
| return 0; |
| } |
| // If \x is not followed by a two-digit hexadecimal, treat it |
| // as an identity escape. |
| return 'x'; |
| } |
| case 'u': { |
| Advance(); |
| uc32 value; |
| if (ParseUnicodeEscape(&value)) return value; |
| if (unicode()) { |
| // With /u, invalid escapes are not treated as identity escapes. |
| ReportError(CStrVector("Invalid unicode escape")); |
| return 0; |
| } |
| // If \u is not followed by a two-digit hexadecimal, treat it |
| // as an identity escape. |
| return 'u'; |
| } |
| default: { |
| uc32 result = current(); |
| // With /u, no identity escapes except for syntax characters and '-' are |
| // allowed. Otherwise, all identity escapes are allowed. |
| if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { |
| Advance(); |
| return result; |
| } |
| ReportError(CStrVector("Invalid escape")); |
| return 0; |
| } |
| } |
| return 0; |
| } |
| |
| void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges, |
| Zone* zone, |
| bool add_unicode_case_equivalents, |
| uc32* char_out, bool* is_class_escape) { |
| uc32 current_char = current(); |
| if (current_char == '\\') { |
| switch (Next()) { |
| case 'w': |
| case 'W': |
| case 'd': |
| case 'D': |
| case 's': |
| case 'S': { |
| CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges, |
| add_unicode_case_equivalents, zone); |
| Advance(2); |
| *is_class_escape = true; |
| return; |
| } |
| case kEndMarker: |
| ReportError(CStrVector("\\ at end of pattern")); |
| return; |
| case 'p': |
| case 'P': |
| if (FLAG_harmony_regexp_property && unicode()) { |
| bool negate = Next() == 'P'; |
| Advance(2); |
| if (!ParsePropertyClass(ranges, negate)) { |
| ReportError(CStrVector("Invalid property name in character class")); |
| } |
| *is_class_escape = true; |
| return; |
| } |
| break; |
| default: |
| break; |
| } |
| *char_out = ParseClassCharacterEscape(); |
| *is_class_escape = false; |
| } else { |
| Advance(); |
| *char_out = current_char; |
| *is_class_escape = false; |
| } |
| } |
| |
| RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { |
| static const char* kUnterminated = "Unterminated character class"; |
| static const char* kRangeInvalid = "Invalid character class"; |
| static const char* kRangeOutOfOrder = "Range out of order in character class"; |
| |
| DCHECK_EQ(current(), '['); |
| Advance(); |
| bool is_negated = false; |
| if (current() == '^') { |
| is_negated = true; |
| Advance(); |
| } |
| ZoneList<CharacterRange>* ranges = |
| new (zone()) ZoneList<CharacterRange>(2, zone()); |
| bool add_unicode_case_equivalents = unicode() && builder->ignore_case(); |
| while (has_more() && current() != ']') { |
| uc32 char_1, char_2; |
| bool is_class_1, is_class_2; |
| ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1, |
| &is_class_1 CHECK_FAILED); |
| if (current() == '-') { |
| Advance(); |
| if (current() == kEndMarker) { |
| // If we reach the end we break out of the loop and let the |
| // following code report an error. |
| break; |
| } else if (current() == ']') { |
| if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); |
| ranges->Add(CharacterRange::Singleton('-'), zone()); |
| break; |
| } |
| ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2, |
| &is_class_2 CHECK_FAILED); |
| if (is_class_1 || is_class_2) { |
| // Either end is an escaped character class. Treat the '-' verbatim. |
| if (unicode()) { |
| // ES2015 21.2.2.15.1 step 1. |
| return ReportError(CStrVector(kRangeInvalid)); |
| } |
| if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); |
| ranges->Add(CharacterRange::Singleton('-'), zone()); |
| if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone()); |
| continue; |
| } |
| // ES2015 21.2.2.15.1 step 6. |
| if (char_1 > char_2) { |
| return ReportError(CStrVector(kRangeOutOfOrder)); |
| } |
| ranges->Add(CharacterRange::Range(char_1, char_2), zone()); |
| } else { |
| if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); |
| } |
| } |
| if (!has_more()) { |
| return ReportError(CStrVector(kUnterminated)); |
| } |
| Advance(); |
| RegExpCharacterClass::CharacterClassFlags character_class_flags; |
| if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED; |
| return new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags(), |
| character_class_flags); |
| } |
| |
| |
| #undef CHECK_FAILED |
| |
| |
| bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, |
| FlatStringReader* input, JSRegExp::Flags flags, |
| RegExpCompileData* result) { |
| DCHECK(result != nullptr); |
| RegExpParser parser(input, &result->error, flags, isolate, zone); |
| RegExpTree* tree = parser.ParsePattern(); |
| if (parser.failed()) { |
| DCHECK(tree == nullptr); |
| DCHECK(!result->error.is_null()); |
| } else { |
| DCHECK(tree != nullptr); |
| DCHECK(result->error.is_null()); |
| if (FLAG_trace_regexp_parser) { |
| OFStream os(stdout); |
| tree->Print(os, zone); |
| os << "\n"; |
| } |
| result->tree = tree; |
| int capture_count = parser.captures_started(); |
| result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; |
| result->contains_anchor = parser.contains_anchor(); |
| result->capture_name_map = parser.CreateCaptureNameMap(); |
| result->capture_count = capture_count; |
| } |
| return !parser.failed(); |
| } |
| |
| RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags) |
| : zone_(zone), |
| pending_empty_(false), |
| flags_(flags), |
| characters_(nullptr), |
| pending_surrogate_(kNoPendingSurrogate), |
| terms_(), |
| alternatives_() |
| #ifdef DEBUG |
| , |
| last_added_(ADD_NONE) |
| #endif |
| { |
| } |
| |
| |
| void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) { |
| DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
| FlushPendingSurrogate(); |
| // Hold onto the lead surrogate, waiting for a trail surrogate to follow. |
| pending_surrogate_ = lead_surrogate; |
| } |
| |
| |
| void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { |
| DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); |
| if (pending_surrogate_ != kNoPendingSurrogate) { |
| uc16 lead_surrogate = pending_surrogate_; |
| pending_surrogate_ = kNoPendingSurrogate; |
| DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
| uc32 combined = |
| unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate); |
| if (NeedsDesugaringForIgnoreCase(combined)) { |
| AddCharacterClassForDesugaring(combined); |
| } else { |
| ZoneList<uc16> surrogate_pair(2, zone()); |
| surrogate_pair.Add(lead_surrogate, zone()); |
| surrogate_pair.Add(trail_surrogate, zone()); |
| RegExpAtom* atom = |
| new (zone()) RegExpAtom(surrogate_pair.ToConstVector(), flags_); |
| AddAtom(atom); |
| } |
| } else { |
| pending_surrogate_ = trail_surrogate; |
| FlushPendingSurrogate(); |
| } |
| } |
| |
| |
| void RegExpBuilder::FlushPendingSurrogate() { |
| if (pending_surrogate_ != kNoPendingSurrogate) { |
| DCHECK(unicode()); |
| uc32 c = pending_surrogate_; |
| pending_surrogate_ = kNoPendingSurrogate; |
| AddCharacterClassForDesugaring(c); |
| } |
| } |
| |
| |
| void RegExpBuilder::FlushCharacters() { |
| FlushPendingSurrogate(); |
| pending_empty_ = false; |
| if (characters_ != nullptr) { |
| RegExpTree* atom = |
| new (zone()) RegExpAtom(characters_->ToConstVector(), flags_); |
| characters_ = nullptr; |
| text_.Add(atom, zone()); |
| LAST(ADD_ATOM); |
| } |
| } |
| |
| |
| void RegExpBuilder::FlushText() { |
| FlushCharacters(); |
| int num_text = text_.length(); |
| if (num_text == 0) { |
| return; |
| } else if (num_text == 1) { |
| terms_.Add(text_.last(), zone()); |
| } else { |
| RegExpText* text = new (zone()) RegExpText(zone()); |
| for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); |
| terms_.Add(text, zone()); |
| } |
| text_.Clear(); |
| } |
| |
| |
| void RegExpBuilder::AddCharacter(uc16 c) { |
| FlushPendingSurrogate(); |
| pending_empty_ = false; |
| if (NeedsDesugaringForIgnoreCase(c)) { |
| AddCharacterClassForDesugaring(c); |
| } else { |
| if (characters_ == nullptr) { |
| characters_ = new (zone()) ZoneList<uc16>(4, zone()); |
| } |
| characters_->Add(c, zone()); |
| LAST(ADD_CHAR); |
| } |
| } |
| |
| |
| void RegExpBuilder::AddUnicodeCharacter(uc32 c) { |
| if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) { |
| DCHECK(unicode()); |
| AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); |
| AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); |
| } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { |
| AddLeadSurrogate(c); |
| } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { |
| AddTrailSurrogate(c); |
| } else { |
| AddCharacter(static_cast<uc16>(c)); |
| } |
| } |
| |
| void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) { |
| // A lead or trail surrogate parsed via escape sequence will not |
| // pair up with any preceding lead or following trail surrogate. |
| FlushPendingSurrogate(); |
| AddUnicodeCharacter(character); |
| FlushPendingSurrogate(); |
| } |
| |
| void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
| |
| |
| void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
| if (NeedsDesugaringForUnicode(cc)) { |
| // With /u, character class needs to be desugared, so it |
| // must be a standalone term instead of being part of a RegExpText. |
| AddTerm(cc); |
| } else { |
| AddAtom(cc); |
| } |
| } |
| |
| void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) { |
| AddTerm(new (zone()) RegExpCharacterClass( |
| zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)), |
| flags_)); |
| } |
| |
| |
| void RegExpBuilder::AddAtom(RegExpTree* term) { |
| if (term->IsEmpty()) { |
| AddEmpty(); |
| return; |
| } |
| if (term->IsTextElement()) { |
| FlushCharacters(); |
| text_.Add(term, zone()); |
| } else { |
| FlushText(); |
| terms_.Add(term, zone()); |
| } |
| LAST(ADD_ATOM); |
| } |
| |
| |
| void RegExpBuilder::AddTerm(RegExpTree* term) { |
| FlushText(); |
| terms_.Add(term, zone()); |
| LAST(ADD_ATOM); |
| } |
| |
| |
| void RegExpBuilder::AddAssertion(RegExpTree* assert) { |
| FlushText(); |
| if (terms_.length() > 0 && terms_.last()->IsAssertion()) { |
| // Omit repeated assertions of the same type. |
| RegExpAssertion* last = terms_.last()->AsAssertion(); |
| RegExpAssertion* next = assert->AsAssertion(); |
| if (last->assertion_type() == next->assertion_type()) return; |
| } |
| terms_.Add(assert, zone()); |
| LAST(ADD_ASSERT); |
| } |
| |
| |
| void RegExpBuilder::NewAlternative() { FlushTerms(); } |
| |
| |
| void RegExpBuilder::FlushTerms() { |
| FlushText(); |
| int num_terms = terms_.length(); |
| RegExpTree* alternative; |
| if (num_terms == 0) { |
| alternative = new (zone()) RegExpEmpty(); |
| } else if (num_terms == 1) { |
| alternative = terms_.last(); |
| } else { |
| alternative = new (zone()) RegExpAlternative(terms_.GetList(zone())); |
| } |
| alternatives_.Add(alternative, zone()); |
| terms_.Clear(); |
| LAST(ADD_NONE); |
| } |
| |
| |
| bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) { |
| if (!unicode()) return false; |
| // TODO(yangguo): we could be smarter than this. Case-insensitivity does not |
| // necessarily mean that we need to desugar. It's probably nicer to have a |
| // separate pass to figure out unicode desugarings. |
| if (ignore_case()) return true; |
| ZoneList<CharacterRange>* ranges = cc->ranges(zone()); |
| CharacterRange::Canonicalize(ranges); |
| for (int i = ranges->length() - 1; i >= 0; i--) { |
| uc32 from = ranges->at(i).from(); |
| uc32 to = ranges->at(i).to(); |
| // Check for non-BMP characters. |
| if (to >= kNonBmpStart) return true; |
| // Check for lone surrogates. |
| if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; |
| } |
| return false; |
| } |
| |
| |
| bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) { |
| #ifdef V8_INTL_SUPPORT |
| if (unicode() && ignore_case()) { |
| icu::UnicodeSet set(c, c); |
| set.closeOver(USET_CASE_INSENSITIVE); |
| set.removeAllStrings(); |
| return set.size() > 1; |
| } |
| // In the case where ICU is not included, we act as if the unicode flag is |
| // not set, and do not desugar. |
| #endif // V8_INTL_SUPPORT |
| return false; |
| } |
| |
| |
| RegExpTree* RegExpBuilder::ToRegExp() { |
| FlushTerms(); |
| int num_alternatives = alternatives_.length(); |
| if (num_alternatives == 0) return new (zone()) RegExpEmpty(); |
| if (num_alternatives == 1) return alternatives_.last(); |
| return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); |
| } |
| |
| bool RegExpBuilder::AddQuantifierToAtom( |
| int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { |
| FlushPendingSurrogate(); |
| if (pending_empty_) { |
| pending_empty_ = false; |
| return true; |
| } |
| RegExpTree* atom; |
| if (characters_ != nullptr) { |
| DCHECK(last_added_ == ADD_CHAR); |
| // Last atom was character. |
| Vector<const uc16> char_vector = characters_->ToConstVector(); |
| int num_chars = char_vector.length(); |
| if (num_chars > 1) { |
| Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1); |
| text_.Add(new (zone()) RegExpAtom(prefix, flags_), zone()); |
| char_vector = char_vector.SubVector(num_chars - 1, num_chars); |
| } |
| characters_ = nullptr; |
| atom = new (zone()) RegExpAtom(char_vector, flags_); |
| FlushText(); |
| } else if (text_.length() > 0) { |
| DCHECK(last_added_ == ADD_ATOM); |
| atom = text_.RemoveLast(); |
| FlushText(); |
| } else if (terms_.length() > 0) { |
| DCHECK(last_added_ == ADD_ATOM); |
| atom = terms_.RemoveLast(); |
| // With /u, lookarounds are not quantifiable. |
| if (unicode() && atom->IsLookaround()) return false; |
| if (atom->max_match() == 0) { |
| // Guaranteed to only match an empty string. |
| LAST(ADD_TERM); |
| if (min == 0) { |
| return true; |
| } |
| terms_.Add(atom, zone()); |
| return true; |
| } |
| } else { |
| // Only call immediately after adding an atom or character! |
| UNREACHABLE(); |
| } |
| terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
| zone()); |
| LAST(ADD_TERM); |
| return true; |
| } |
| |
| } // namespace internal |
| } // namespace v8 |