| /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- |
| * vim: set ts=8 sts=4 et sw=4 tw=99: */ |
| |
| // Copyright 2012 the V8 project authors. All rights reserved. |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following |
| // disclaimer in the documentation and/or other materials provided |
| // with the distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived |
| // from this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #ifndef V8_PARSER_H_ |
| #define V8_PARSER_H_ |
| |
| #include "irregexp/RegExpAST.h" |
| |
| namespace js { |
| |
| namespace frontend { |
| class TokenStream; |
| } |
| |
| namespace irregexp { |
| |
| bool |
| ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str, |
| bool multiline, bool match_only, |
| RegExpCompileData* data); |
| |
| bool |
| ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str); |
| |
| // A BufferedVector is an automatically growing list, just like (and backed |
| // by) a Vector, that is optimized for the case of adding and removing |
| // a single element. The last element added is stored outside the backing list, |
| // and if no more than one element is ever added, the ZoneList isn't even |
| // allocated. |
| // Elements must not be nullptr pointers. |
| template <typename T, int initial_size> |
| class BufferedVector |
| { |
| public: |
| typedef Vector<T*, 1, LifoAllocPolicy<Infallible> > VectorType; |
| |
| BufferedVector() : list_(nullptr), last_(nullptr) {} |
| |
| // Adds element at end of list. This element is buffered and can |
| // be read using last() or removed using RemoveLast until a new Add or until |
| // RemoveLast or GetList has been called. |
| void Add(LifoAlloc* alloc, T* value) { |
| if (last_ != nullptr) { |
| if (list_ == nullptr) { |
| list_ = alloc->newInfallible<VectorType>(*alloc); |
| list_->reserve(initial_size); |
| } |
| list_->append(last_); |
| } |
| last_ = value; |
| } |
| |
| T* last() { |
| MOZ_ASSERT(last_ != nullptr); |
| return last_; |
| } |
| |
| T* RemoveLast() { |
| MOZ_ASSERT(last_ != nullptr); |
| T* result = last_; |
| if ((list_ != nullptr) && (list_->length() > 0)) |
| last_ = list_->popCopy(); |
| else |
| last_ = nullptr; |
| return result; |
| } |
| |
| T* Get(int i) { |
| MOZ_ASSERT((0 <= i) && (i < length())); |
| if (list_ == nullptr) { |
| MOZ_ASSERT(0 == i); |
| return last_; |
| } else { |
| if (size_t(i) == list_->length()) { |
| MOZ_ASSERT(last_ != nullptr); |
| return last_; |
| } else { |
| return (*list_)[i]; |
| } |
| } |
| } |
| |
| void Clear() { |
| list_ = nullptr; |
| last_ = nullptr; |
| } |
| |
| int length() { |
| int length = (list_ == nullptr) ? 0 : list_->length(); |
| return length + ((last_ == nullptr) ? 0 : 1); |
| } |
| |
| VectorType* GetList(LifoAlloc* alloc) { |
| if (list_ == nullptr) |
| list_ = alloc->newInfallible<VectorType>(*alloc); |
| if (last_ != nullptr) { |
| list_->append(last_); |
| last_ = nullptr; |
| } |
| return list_; |
| } |
| |
| private: |
| VectorType* list_; |
| T* last_; |
| }; |
| |
| |
| // Accumulates RegExp atoms and assertions into lists of terms and alternatives. |
| class RegExpBuilder |
| { |
| public: |
| explicit RegExpBuilder(LifoAlloc* alloc); |
| void AddCharacter(char16_t character); |
| // "Adds" an empty expression. Does nothing except consume a |
| // following quantifier |
| void AddEmpty(); |
| void AddAtom(RegExpTree* tree); |
| void AddAssertion(RegExpTree* tree); |
| void NewAlternative(); // '|' |
| void AddQuantifierToAtom(int min, int max, RegExpQuantifier::QuantifierType type); |
| RegExpTree* ToRegExp(); |
| |
| private: |
| void FlushCharacters(); |
| void FlushText(); |
| void FlushTerms(); |
| |
| LifoAlloc* alloc; |
| bool pending_empty_; |
| CharacterVector* characters_; |
| BufferedVector<RegExpTree, 2> terms_; |
| BufferedVector<RegExpTree, 2> text_; |
| BufferedVector<RegExpTree, 2> alternatives_; |
| |
| enum LastAdded { |
| ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM |
| }; |
| mozilla::DebugOnly<LastAdded> last_added_; |
| }; |
| |
| // Characters parsed by RegExpParser can be either char16_t or kEndMarker. |
| typedef uint32_t widechar; |
| |
| template <typename CharT> |
| class RegExpParser |
| { |
| public: |
| RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc, |
| const CharT* chars, const CharT* end, bool multiline_mode); |
| |
| RegExpTree* ParsePattern(); |
| RegExpTree* ParseDisjunction(); |
| RegExpTree* ParseCharacterClass(); |
| |
| // Parses a {...,...} quantifier and stores the range in the given |
| // out parameters. |
| bool ParseIntervalQuantifier(int* min_out, int* max_out); |
| |
| // Parses and returns a single escaped character. The character |
| // must not be 'b' or 'B' since they are usually handled specially. |
| widechar ParseClassCharacterEscape(); |
| |
| // Checks whether the following is a length-digit hexadecimal number, |
| // and sets the value if it is. |
| bool ParseHexEscape(int length, size_t* value); |
| |
| size_t ParseOctalLiteral(); |
| |
| // Tries to parse the input as a back reference. If successful it |
| // stores the result in the output parameter and returns true. If |
| // it fails it will push back the characters read so the same characters |
| // can be reparsed. |
| bool ParseBackReferenceIndex(int* index_out); |
| |
| bool ParseClassAtom(char16_t* char_class, CharacterRange* char_range); |
| RegExpTree* ReportError(unsigned errorNumber); |
| void Advance(); |
| void Advance(int dist) { |
| next_pos_ += dist - 1; |
| Advance(); |
| } |
| |
| void Reset(const CharT* pos) { |
| next_pos_ = pos; |
| has_more_ = (pos < end_); |
| Advance(); |
| } |
| |
| // Reports whether the pattern might be used as a literal search string. |
| // Only use if the result of the parse is a single atom node. |
| bool simple() { return simple_; } |
| bool contains_anchor() { return contains_anchor_; } |
| void set_contains_anchor() { contains_anchor_ = true; } |
| int captures_started() { return captures_ == nullptr ? 0 : captures_->length(); } |
| const CharT* position() { return next_pos_ - 1; } |
| |
| static const int kMaxCaptures = 1 << 16; |
| static const widechar kEndMarker = (1 << 21); |
| |
| private: |
| enum SubexpressionType { |
| INITIAL, |
| CAPTURE, // All positive values represent captures. |
| POSITIVE_LOOKAHEAD, |
| NEGATIVE_LOOKAHEAD, |
| GROUPING |
| }; |
| |
| class RegExpParserState { |
| public: |
| RegExpParserState(LifoAlloc* alloc, |
| RegExpParserState* previous_state, |
| SubexpressionType group_type, |
| int disjunction_capture_index) |
| : previous_state_(previous_state), |
| builder_(alloc->newInfallible<RegExpBuilder>(alloc)), |
| group_type_(group_type), |
| disjunction_capture_index_(disjunction_capture_index) |
| {} |
| // Parser state of containing expression, if any. |
| RegExpParserState* previous_state() { return previous_state_; } |
| bool IsSubexpression() { return previous_state_ != nullptr; } |
| // RegExpBuilder building this regexp's AST. |
| RegExpBuilder* builder() { return builder_; } |
| // Type of regexp being parsed (parenthesized group or entire regexp). |
| SubexpressionType group_type() { return group_type_; } |
| // Index in captures array of first capture in this sub-expression, if any. |
| // Also the capture index of this sub-expression itself, if group_type |
| // is CAPTURE. |
| int capture_index() { return disjunction_capture_index_; } |
| |
| private: |
| // Linked list implementation of stack of states. |
| RegExpParserState* previous_state_; |
| // Builder for the stored disjunction. |
| RegExpBuilder* builder_; |
| // Stored disjunction type (capture, look-ahead or grouping), if any. |
| SubexpressionType group_type_; |
| // Stored disjunction's capture index (if any). |
| int disjunction_capture_index_; |
| }; |
| |
| widechar current() { return current_; } |
| bool has_more() { return has_more_; } |
| bool has_next() { return next_pos_ < end_; } |
| widechar Next() { |
| if (has_next()) |
| return *next_pos_; |
| return kEndMarker; |
| } |
| void ScanForCaptures(); |
| |
| frontend::TokenStream& ts; |
| LifoAlloc* alloc; |
| RegExpCaptureVector* captures_; |
| const CharT* next_pos_; |
| const CharT* end_; |
| widechar current_; |
| // The capture count is only valid after we have scanned for captures. |
| int capture_count_; |
| bool has_more_; |
| bool multiline_; |
| bool simple_; |
| bool contains_anchor_; |
| bool is_scanned_for_captures_; |
| }; |
| |
| } } // namespace js::irregexp |
| |
| #endif // V8_PARSER_H_ |