| // Copyright 2018 the V8 project authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #ifndef V8_TORQUE_EARLEY_PARSER_H_ |
| #define V8_TORQUE_EARLEY_PARSER_H_ |
| |
| #include <map> |
| #include <memory> |
| #include <vector> |
| |
| #include "src/base/optional.h" |
| #include "src/torque/contextual.h" |
| #include "src/torque/source-positions.h" |
| #include "src/torque/utils.h" |
| |
| namespace v8 { |
| namespace internal { |
| namespace torque { |
| |
| class Symbol; |
| class Item; |
| |
| class ParseResultHolderBase { |
| public: |
| enum class TypeId; |
| virtual ~ParseResultHolderBase() = default; |
| template <class T> |
| T& Cast(); |
| template <class T> |
| const T& Cast() const; |
| |
| protected: |
| explicit ParseResultHolderBase(TypeId type_id) : type_id_(type_id) { |
| // MSVC wrongly complains about type_id_ being an unused private field. |
| USE(type_id_); |
| } |
| |
| private: |
| const TypeId type_id_; |
| }; |
| |
| enum class ParseResultHolderBase::TypeId { |
| kStdString, |
| kBool, |
| kInt32, |
| kStdVectorOfString, |
| kExpressionPtr, |
| kIdentifierPtr, |
| kOptionalIdentifierPtr, |
| kStatementPtr, |
| kDeclarationPtr, |
| kTypeExpressionPtr, |
| kOptionalTypeExpressionPtr, |
| kTryHandlerPtr, |
| kNameAndTypeExpression, |
| kEnumEntry, |
| kStdVectorOfEnumEntry, |
| kImplicitParameters, |
| kOptionalImplicitParameters, |
| kNameAndExpression, |
| kAnnotation, |
| kVectorOfAnnotation, |
| kAnnotationParameter, |
| kOptionalAnnotationParameter, |
| kClassFieldExpression, |
| kStructFieldExpression, |
| kBitFieldDeclaration, |
| kStdVectorOfNameAndTypeExpression, |
| kStdVectorOfNameAndExpression, |
| kStdVectorOfClassFieldExpression, |
| kStdVectorOfStructFieldExpression, |
| kStdVectorOfBitFieldDeclaration, |
| kIncrementDecrementOperator, |
| kOptionalStdString, |
| kStdVectorOfStatementPtr, |
| kStdVectorOfDeclarationPtr, |
| kStdVectorOfStdVectorOfDeclarationPtr, |
| kStdVectorOfExpressionPtr, |
| kExpressionWithSource, |
| kParameterList, |
| kTypeList, |
| kOptionalTypeList, |
| kLabelAndTypes, |
| kStdVectorOfLabelAndTypes, |
| kStdVectorOfTryHandlerPtr, |
| kOptionalStatementPtr, |
| kOptionalExpressionPtr, |
| kTypeswitchCase, |
| kStdVectorOfTypeswitchCase, |
| kStdVectorOfIdentifierPtr, |
| kOptionalClassBody, |
| kGenericParameter, |
| kGenericParameters, |
| |
| kJsonValue, |
| kJsonMember, |
| kStdVectorOfJsonValue, |
| kStdVectorOfJsonMember, |
| }; |
| |
| using ParseResultTypeId = ParseResultHolderBase::TypeId; |
| |
| template <class T> |
| class ParseResultHolder : public ParseResultHolderBase { |
| public: |
| explicit ParseResultHolder(T value) |
| : ParseResultHolderBase(id), value_(std::move(value)) {} |
| |
| private: |
| V8_EXPORT_PRIVATE static const TypeId id; |
| friend class ParseResultHolderBase; |
| T value_; |
| }; |
| |
| template <class T> |
| T& ParseResultHolderBase::Cast() { |
| CHECK_EQ(ParseResultHolder<T>::id, type_id_); |
| return static_cast<ParseResultHolder<T>*>(this)->value_; |
| } |
| |
| template <class T> |
| const T& ParseResultHolderBase::Cast() const { |
| CHECK_EQ(ParseResultHolder<T>::id, type_id_); |
| return static_cast<const ParseResultHolder<T>*>(this)->value_; |
| } |
| |
| class ParseResult { |
| public: |
| template <class T> |
| explicit ParseResult(T x) : value_(new ParseResultHolder<T>(std::move(x))) {} |
| |
| template <class T> |
| const T& Cast() const& { |
| return value_->Cast<T>(); |
| } |
| template <class T> |
| T& Cast() & { |
| return value_->Cast<T>(); |
| } |
| template <class T> |
| T&& Cast() && { |
| return std::move(value_->Cast<T>()); |
| } |
| |
| private: |
| std::unique_ptr<ParseResultHolderBase> value_; |
| }; |
| |
| using InputPosition = const char*; |
| |
| struct MatchedInput { |
| MatchedInput(InputPosition begin, InputPosition end, SourcePosition pos) |
| : begin(begin), end(end), pos(pos) {} |
| InputPosition begin; |
| InputPosition end; |
| SourcePosition pos; |
| std::string ToString() const { return {begin, end}; } |
| }; |
| |
| class ParseResultIterator { |
| public: |
| explicit ParseResultIterator(std::vector<ParseResult> results, |
| MatchedInput matched_input) |
| : results_(std::move(results)), matched_input_(matched_input) {} |
| ~ParseResultIterator() { |
| // Check that all parse results have been used. |
| CHECK_EQ(results_.size(), i_); |
| } |
| |
| ParseResult Next() { |
| CHECK_LT(i_, results_.size()); |
| return std::move(results_[i_++]); |
| } |
| template <class T> |
| T NextAs() { |
| return std::move(Next().Cast<T>()); |
| } |
| bool HasNext() const { return i_ < results_.size(); } |
| |
| const MatchedInput& matched_input() const { return matched_input_; } |
| |
| private: |
| std::vector<ParseResult> results_; |
| size_t i_ = 0; |
| MatchedInput matched_input_; |
| |
| DISALLOW_COPY_AND_ASSIGN(ParseResultIterator); |
| }; |
| |
| struct LexerResult { |
| std::vector<Symbol*> token_symbols; |
| std::vector<MatchedInput> token_contents; |
| }; |
| |
| using Action = |
| base::Optional<ParseResult> (*)(ParseResultIterator* child_results); |
| |
| inline base::Optional<ParseResult> DefaultAction( |
| ParseResultIterator* child_results) { |
| if (!child_results->HasNext()) return base::nullopt; |
| return child_results->Next(); |
| } |
| |
| template <class T, Action action> |
| inline Action AsSingletonVector() { |
| return [](ParseResultIterator* child_results) -> base::Optional<ParseResult> { |
| auto result = action(child_results); |
| if (!result) return result; |
| return ParseResult{std::vector<T>{(*result).Cast<T>()}}; |
| }; |
| } |
| |
| // A rule of the context-free grammar. Each rule can have an action attached to |
| // it, which is executed after the parsing is finished. |
| class Rule final { |
| public: |
| explicit Rule(std::vector<Symbol*> right_hand_side, |
| Action action = DefaultAction) |
| : right_hand_side_(std::move(right_hand_side)), action_(action) {} |
| |
| Symbol* left() const { |
| DCHECK_NOT_NULL(left_hand_side_); |
| return left_hand_side_; |
| } |
| const std::vector<Symbol*>& right() const { return right_hand_side_; } |
| |
| void SetLeftHandSide(Symbol* left_hand_side) { |
| DCHECK_NULL(left_hand_side_); |
| left_hand_side_ = left_hand_side; |
| } |
| |
| V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction( |
| const Item* completed_item, const LexerResult& tokens) const; |
| |
| private: |
| Symbol* left_hand_side_ = nullptr; |
| std::vector<Symbol*> right_hand_side_; |
| Action action_; |
| }; |
| |
| // A Symbol represents a terminal or a non-terminal of the grammar. |
| // It stores the list of rules, which have this symbol as the |
| // left-hand side. |
| // Terminals have an empty list of rules, they are created by the Lexer |
| // instead of from rules. |
| // Symbols need to reside at stable memory addresses, because the addresses are |
| // used in the parser. |
| class Symbol { |
| public: |
| Symbol() : Symbol({}) {} |
| Symbol(std::initializer_list<Rule> rules) { *this = rules; } |
| |
| V8_EXPORT_PRIVATE Symbol& operator=(std::initializer_list<Rule> rules); |
| |
| bool IsTerminal() const { return rules_.empty(); } |
| Rule* rule(size_t index) const { return rules_[index].get(); } |
| size_t rule_number() const { return rules_.size(); } |
| |
| void AddRule(const Rule& rule) { |
| rules_.push_back(std::make_unique<Rule>(rule)); |
| rules_.back()->SetLeftHandSide(this); |
| } |
| |
| V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction( |
| const Item* item, const LexerResult& tokens); |
| |
| private: |
| std::vector<std::unique_ptr<Rule>> rules_; |
| |
| // Disallow copying and moving to ensure Symbol has a stable address. |
| DISALLOW_COPY_AND_ASSIGN(Symbol); |
| }; |
| |
| // Items are the core datastructure of Earley's algorithm. |
| // They consist of a (partially) matched rule, a marked position inside of the |
| // right-hand side of the rule (traditionally written as a dot) and an input |
| // range from {start} to {pos} that matches the symbols of the right-hand side |
| // that are left of the mark. In addition, they store a child and a left-sibling |
| // pointer to reconstruct the AST in the end. |
| class Item { |
| public: |
| Item(const Rule* rule, size_t mark, size_t start, size_t pos) |
| : rule_(rule), mark_(mark), start_(start), pos_(pos) { |
| DCHECK_LE(mark_, right().size()); |
| } |
| |
| // A complete item has the mark at the right end, which means the input range |
| // matches the complete rule. |
| bool IsComplete() const { |
| DCHECK_LE(mark_, right().size()); |
| return mark_ == right().size(); |
| } |
| |
| // The symbol right after the mark is expected at {pos} for this item to |
| // advance. |
| Symbol* NextSymbol() const { |
| DCHECK(!IsComplete()); |
| DCHECK_LT(mark_, right().size()); |
| return right()[mark_]; |
| } |
| |
| // We successfully parsed NextSymbol() between {pos} and {new_pos}. |
| // If NextSymbol() was a non-terminal, then {child} is a pointer to a |
| // completed item for this parse. |
| // We create a new item, which moves the mark one forward. |
| Item Advance(size_t new_pos, const Item* child = nullptr) const { |
| if (child) { |
| DCHECK(child->IsComplete()); |
| DCHECK_EQ(pos(), child->start()); |
| DCHECK_EQ(new_pos, child->pos()); |
| DCHECK_EQ(NextSymbol(), child->left()); |
| } |
| Item result(rule_, mark_ + 1, start_, new_pos); |
| result.prev_ = this; |
| result.child_ = child; |
| return result; |
| } |
| |
| // Collect the items representing the AST children of this completed item. |
| std::vector<const Item*> Children() const; |
| // The matched input separated according to the next branching AST level. |
| std::string SplitByChildren(const LexerResult& tokens) const; |
| // Check if {other} results in the same AST as this Item. |
| void CheckAmbiguity(const Item& other, const LexerResult& tokens) const; |
| |
| MatchedInput GetMatchedInput(const LexerResult& tokens) const { |
| const MatchedInput& start = tokens.token_contents[start_]; |
| const MatchedInput& end = start_ == pos_ ? tokens.token_contents[start_] |
| : tokens.token_contents[pos_ - 1]; |
| CHECK_EQ(start.pos.source, end.pos.source); |
| SourcePosition combined{start.pos.source, start.pos.start, end.pos.end}; |
| |
| return {start.begin, end.end, combined}; |
| } |
| |
| // We exclude {prev_} and {child_} from equality and hash computations, |
| // because they are just globally unique data associated with an item. |
| bool operator==(const Item& other) const { |
| return rule_ == other.rule_ && mark_ == other.mark_ && |
| start_ == other.start_ && pos_ == other.pos_; |
| } |
| |
| friend size_t hash_value(const Item& i) { |
| return base::hash_combine(i.rule_, i.mark_, i.start_, i.pos_); |
| } |
| |
| const Rule* rule() const { return rule_; } |
| Symbol* left() const { return rule_->left(); } |
| const std::vector<Symbol*>& right() const { return rule_->right(); } |
| size_t pos() const { return pos_; } |
| size_t start() const { return start_; } |
| |
| private: |
| const Rule* rule_; |
| size_t mark_; |
| size_t start_; |
| size_t pos_; |
| |
| const Item* prev_ = nullptr; |
| const Item* child_ = nullptr; |
| }; |
| |
| inline base::Optional<ParseResult> Symbol::RunAction( |
| const Item* item, const LexerResult& tokens) { |
| DCHECK(item->IsComplete()); |
| DCHECK_EQ(item->left(), this); |
| return item->rule()->RunAction(item, tokens); |
| } |
| |
| V8_EXPORT_PRIVATE const Item* RunEarleyAlgorithm( |
| Symbol* start, const LexerResult& tokens, |
| std::unordered_set<Item, base::hash<Item>>* processed); |
| |
| inline base::Optional<ParseResult> ParseTokens(Symbol* start, |
| const LexerResult& tokens) { |
| std::unordered_set<Item, base::hash<Item>> table; |
| const Item* final_item = RunEarleyAlgorithm(start, tokens, &table); |
| return start->RunAction(final_item, tokens); |
| } |
| |
| // The lexical syntax is dynamically defined while building the grammar by |
| // adding patterns and keywords to the Lexer. |
| // The term keyword here can stand for any fixed character sequence, including |
| // operators and parentheses. |
| // Each pattern or keyword automatically gets a terminal symbol associated with |
| // it. These symbols form the result of the lexing. |
| // Patterns and keywords are matched using the longest match principle. If the |
| // longest matching pattern coincides with a keyword, the keyword symbol is |
| // chosen instead of the pattern. |
| // In addition, there is a single whitespace pattern which is consumed but does |
| // not become part of the token list. |
| class Lexer { |
| public: |
| // Functions to define patterns. They try to match starting from {pos}. If |
| // successful, they return true and advance {pos}. Otherwise, {pos} stays |
| // unchanged. |
| using PatternFunction = bool (*)(InputPosition* pos); |
| |
| void SetWhitespace(PatternFunction whitespace) { |
| match_whitespace_ = whitespace; |
| } |
| |
| Symbol* Pattern(PatternFunction pattern) { return &patterns_[pattern]; } |
| Symbol* Token(const std::string& keyword) { return &keywords_[keyword]; } |
| V8_EXPORT_PRIVATE LexerResult RunLexer(const std::string& input); |
| |
| private: |
| PatternFunction match_whitespace_ = [](InputPosition*) { return false; }; |
| std::map<PatternFunction, Symbol> patterns_; |
| std::map<std::string, Symbol> keywords_; |
| Symbol* MatchToken(InputPosition* pos, InputPosition end); |
| }; |
| |
| // A grammar can have a result, which is the results of the start symbol. |
| // Grammar is intended to be subclassed, with Symbol members forming the |
| // mutually recursive rules of the grammar. |
| class Grammar { |
| public: |
| using PatternFunction = Lexer::PatternFunction; |
| |
| explicit Grammar(Symbol* start) : start_(start) {} |
| |
| base::Optional<ParseResult> Parse(const std::string& input) { |
| LexerResult tokens = lexer().RunLexer(input); |
| return ParseTokens(start_, tokens); |
| } |
| |
| protected: |
| Symbol* Token(const std::string& s) { return lexer_.Token(s); } |
| Symbol* Pattern(PatternFunction pattern) { return lexer_.Pattern(pattern); } |
| void SetWhitespace(PatternFunction ws) { lexer_.SetWhitespace(ws); } |
| |
| // NewSymbol() allocates a fresh symbol and stores it in the current grammar. |
| // This is necessary to define helpers that create new symbols. |
| Symbol* NewSymbol(std::initializer_list<Rule> rules = {}) { |
| auto symbol = std::make_unique<Symbol>(rules); |
| Symbol* result = symbol.get(); |
| generated_symbols_.push_back(std::move(symbol)); |
| return result; |
| } |
| |
| // Helper functions to define lexer patterns. If they match, they return true |
| // and advance {pos}. Otherwise, {pos} is unchanged. |
| V8_EXPORT_PRIVATE static bool MatchChar(int (*char_class)(int), |
| InputPosition* pos); |
| V8_EXPORT_PRIVATE static bool MatchChar(bool (*char_class)(char), |
| InputPosition* pos); |
| V8_EXPORT_PRIVATE static bool MatchAnyChar(InputPosition* pos); |
| V8_EXPORT_PRIVATE static bool MatchString(const char* s, InputPosition* pos); |
| |
| // The action MatchInput() produces the input matched by the rule as |
| // result. |
| static base::Optional<ParseResult> YieldMatchedInput( |
| ParseResultIterator* child_results) { |
| return ParseResult{child_results->matched_input().ToString()}; |
| } |
| |
| // Create a new symbol to parse the given sequence of symbols. |
| // At most one of the symbols can return a result. |
| Symbol* Sequence(std::vector<Symbol*> symbols) { |
| return NewSymbol({Rule(std::move(symbols))}); |
| } |
| |
| template <class T, T value> |
| static base::Optional<ParseResult> YieldIntegralConstant( |
| ParseResultIterator* child_results) { |
| return ParseResult{value}; |
| } |
| |
| template <class T> |
| static base::Optional<ParseResult> YieldDefaultValue( |
| ParseResultIterator* child_results) { |
| return ParseResult{T{}}; |
| } |
| |
| template <class From, class To> |
| static base::Optional<ParseResult> CastParseResult( |
| ParseResultIterator* child_results) { |
| To result = std::move(child_results->NextAs<From>()); |
| return ParseResult{std::move(result)}; |
| } |
| |
| // Try to parse {s} and return the result of type {Result} casted to {T}. |
| // Otherwise, the result is a default-constructed {T}. |
| template <class T, class Result = T> |
| Symbol* TryOrDefault(Symbol* s) { |
| return NewSymbol({Rule({s}, CastParseResult<Result, T>), |
| Rule({}, YieldDefaultValue<T>)}); |
| } |
| |
| template <class T> |
| static base::Optional<ParseResult> MakeSingletonVector( |
| ParseResultIterator* child_results) { |
| T x = child_results->NextAs<T>(); |
| std::vector<T> result; |
| result.push_back(std::move(x)); |
| return ParseResult{std::move(result)}; |
| } |
| |
| template <class T> |
| static base::Optional<ParseResult> MakeExtendedVector( |
| ParseResultIterator* child_results) { |
| std::vector<T> l = child_results->NextAs<std::vector<T>>(); |
| T x = child_results->NextAs<T>(); |
| l.push_back(std::move(x)); |
| return ParseResult{std::move(l)}; |
| } |
| |
| // For example, NonemptyList(Token("A"), Token(",")) parses any of |
| // A or A,A or A,A,A and so on. |
| template <class T> |
| Symbol* NonemptyList(Symbol* element, |
| base::Optional<Symbol*> separator = {}) { |
| Symbol* list = NewSymbol(); |
| *list = {Rule({element}, MakeSingletonVector<T>), |
| separator |
| ? Rule({list, *separator, element}, MakeExtendedVector<T>) |
| : Rule({list, element}, MakeExtendedVector<T>)}; |
| return list; |
| } |
| |
| template <class T> |
| Symbol* List(Symbol* element, base::Optional<Symbol*> separator = {}) { |
| return TryOrDefault<std::vector<T>>(NonemptyList<T>(element, separator)); |
| } |
| |
| template <class T> |
| Symbol* Optional(Symbol* x) { |
| return TryOrDefault<base::Optional<T>, T>(x); |
| } |
| |
| Symbol* CheckIf(Symbol* x) { |
| return NewSymbol({Rule({x}, YieldIntegralConstant<bool, true>), |
| Rule({}, YieldIntegralConstant<bool, false>)}); |
| } |
| |
| Lexer& lexer() { return lexer_; } |
| |
| private: |
| Lexer lexer_; |
| std::vector<std::unique_ptr<Symbol>> generated_symbols_; |
| Symbol* start_; |
| }; |
| |
| } // namespace torque |
| } // namespace internal |
| } // namespace v8 |
| |
| #endif // V8_TORQUE_EARLEY_PARSER_H_ |