| // Copyright 2018 the V8 project authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include <algorithm> |
| #include <set> |
| #include <unordered_map> |
| #include <unordered_set> |
| |
| #include "src/torque/ast.h" |
| #include "src/torque/earley-parser.h" |
| #include "src/torque/utils.h" |
| |
| namespace v8 { |
| namespace internal { |
| namespace torque { |
| |
| namespace { |
| |
| struct LineAndColumnTracker { |
| LineAndColumn previous{0, 0}; |
| LineAndColumn current{0, 0}; |
| |
| void Advance(InputPosition from, InputPosition to) { |
| previous = current; |
| while (from != to) { |
| if (*from == '\n') { |
| current.line += 1; |
| current.column = 0; |
| } else { |
| current.column += 1; |
| } |
| ++from; |
| } |
| } |
| |
| SourcePosition ToSourcePosition() { |
| return {CurrentSourceFile::Get(), previous, current}; |
| } |
| }; |
| |
| } // namespace |
| |
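// Computes the semantic value of a completed item: first run the actions of
// all child items (null children correspond to terminal tokens and carry no
// result), then invoke this rule's action on the collected child results and
// the matched input.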
| base::Optional<ParseResult> Rule::RunAction(const Item* completed_item, |
| const LexerResult& tokens) const { |
| std::vector<ParseResult> results; |
| for (const Item* child : completed_item->Children()) { |
| if (!child) continue; |
| base::Optional<ParseResult> child_result = |
| child->left()->RunAction(child, tokens); |
| if (child_result) results.push_back(std::move(*child_result)); |
| } |
| MatchedInput matched_input = completed_item->GetMatchedInput(tokens); |
| CurrentSourcePosition::Scope pos_scope(matched_input.pos); |
| ParseResultIterator iterator(std::move(results), matched_input); |
| return action_(&iterator); |
| } |
| |
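// Assigning an initializer list of rules to a symbol drops all of its
// previously added rules and installs the new ones.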
| Symbol& Symbol::operator=(std::initializer_list<Rule> rules) { |
| rules_.clear(); |
| for (const Rule& rule : rules) { |
| AddRule(rule); |
| } |
| return *this; |
| } |
| |
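// Collects the child items of a completed item, one per symbol on the
// right-hand side of its rule, by walking the chain of predecessor items.
// Terminal symbols matched during the scan phase contribute null children.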
| std::vector<const Item*> Item::Children() const { |
| std::vector<const Item*> children; |
| for (const Item* current = this; current->prev_; current = current->prev_) { |
| children.push_back(current->child_); |
| } |
| // The above loop collects the child nodes in reversed order. |
| std::reverse(children.begin(), children.end()); |
| DCHECK_EQ(children.size(), right().size()); |
| return children; |
| } |
| |
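// Renders the matched input with the pieces matched by the individual
// children separated by spaces; chains of unary rules are collapsed into
// their single child. Used to print the derivations of an ambiguity.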
| std::string Item::SplitByChildren(const LexerResult& tokens) const { |
| if (right().size() == 1) { |
| if (const Item* child = Children()[0]) |
| return child->SplitByChildren(tokens); |
| } |
| std::stringstream s; |
| bool first = true; |
| for (const Item* item : Children()) { |
| if (!item) continue; |
| if (!first) s << " "; |
| s << item->GetMatchedInput(tokens).ToString(); |
| first = false; |
| } |
| return s.str(); |
| } |
| |
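// Called when an item structurally equal to {other} (same rule, dot position
// and input range) has been derived a second time. If the two derivations
// disagree on their child or predecessor item, the grammar is ambiguous for
// the matched input and an error is reported.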
| void Item::CheckAmbiguity(const Item& other, const LexerResult& tokens) const { |
| DCHECK(*this == other); |
| if (child_ != other.child_) { |
| std::stringstream s; |
| s << "Ambiguous grammer rules for \"" |
| << child_->GetMatchedInput(tokens).ToString() << "\":\n " |
| << child_->SplitByChildren(tokens) << "\nvs\n " |
| << other.child_->SplitByChildren(tokens); |
| ReportError(s.str()); |
| } |
| if (prev_ != other.prev_) { |
| std::stringstream s; |
| s << "Ambiguous grammer rules for \"" << GetMatchedInput(tokens).ToString() |
| << "\":\n " << SplitByChildren(tokens) << " ...\nvs\n " |
| << other.SplitByChildren(tokens) << " ..."; |
| ReportError(s.str()); |
| } |
| } |
| |
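// Splits {input} into a sequence of tokens: skip whitespace, consume the
// longest pattern or keyword match, and record its symbol, text and source
// position. A character sequence that matches nothing is a lexer error.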
| LexerResult Lexer::RunLexer(const std::string& input) { |
| LexerResult result; |
| InputPosition const begin = input.c_str(); |
| InputPosition const end = begin + input.size(); |
| InputPosition pos = begin; |
| InputPosition token_start = pos; |
| LineAndColumnTracker line_column_tracker; |
| |
| match_whitespace_(&pos); |
| line_column_tracker.Advance(token_start, pos); |
| while (pos != end) { |
| token_start = pos; |
| Symbol* symbol = MatchToken(&pos, end); |
| InputPosition token_end = pos; |
| line_column_tracker.Advance(token_start, token_end); |
| if (!symbol) { |
| CurrentSourcePosition::Scope pos_scope( |
| line_column_tracker.ToSourcePosition()); |
| ReportError("Lexer Error: unknown token " + |
| StringLiteralQuote(std::string( |
| token_start, token_start + std::min<ptrdiff_t>( |
| end - token_start, 10)))); |
| } |
| result.token_symbols.push_back(symbol); |
| result.token_contents.push_back( |
| {token_start, pos, line_column_tracker.ToSourcePosition()}); |
| match_whitespace_(&pos); |
| line_column_tracker.Advance(token_end, pos); |
| } |
| |
| // Add an additional token position to simplify corner cases. |
| line_column_tracker.Advance(token_start, pos); |
| result.token_contents.push_back( |
| {pos, pos, line_column_tracker.ToSourcePosition()}); |
| return result; |
| } |
| |
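// Consumes the longest pattern or keyword match starting at {*pos} and
// returns its symbol; returns nullptr and leaves {*pos} unchanged if nothing
// matches.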
| Symbol* Lexer::MatchToken(InputPosition* pos, InputPosition end) { |
| InputPosition token_start = *pos; |
| Symbol* symbol = nullptr; |
| // Find longest matching pattern. |
| for (std::pair<const PatternFunction, Symbol>& pair : patterns_) { |
| InputPosition token_end = token_start; |
| PatternFunction matchPattern = pair.first; |
| if (matchPattern(&token_end) && token_end > *pos) { |
| *pos = token_end; |
| symbol = &pair.second; |
| } |
| } |
| size_t pattern_size = *pos - token_start; |
| |
| // Now check for keywords. Prefer keywords over patterns unless the pattern is |
| // longer. Iterate from the end to ensure that if one keyword is a prefix of |
| // another, we first try to match the longer one. |
| for (auto it = keywords_.rbegin(); it != keywords_.rend(); ++it) { |
| const std::string& keyword = it->first; |
| if (static_cast<size_t>(end - token_start) < keyword.size()) continue; |
| if (keyword.size() >= pattern_size && |
| keyword == std::string(token_start, token_start + keyword.size())) { |
| *pos = token_start + keyword.size(); |
| return &it->second; |
| } |
| } |
| if (pattern_size > 0) return symbol; |
| return nullptr; |
| } |
| |
| // This is an implementation of Earley's parsing algorithm |
| // (https://en.wikipedia.org/wiki/Earley_parser). |
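// Given the start symbol of the grammar and the lexed input, it returns the
// completed item covering the whole input (or reports a parser error), from
// which the parse result can be computed by running the rule actions. A
// hypothetical driver (names like {lexer}, {start_symbol} and {source} are
// placeholders, not part of this file) would combine the pieces roughly like
// this:
//
//   LexerResult tokens = lexer.RunLexer(source);
//   std::unordered_set<Item, base::hash<Item>> processed;
//   const Item* root = RunEarleyAlgorithm(start_symbol, tokens, &processed);
//   base::Optional<ParseResult> result =
//       root->left()->RunAction(root, tokens);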
| const Item* RunEarleyAlgorithm( |
| Symbol* start, const LexerResult& tokens, |
| std::unordered_set<Item, base::hash<Item>>* processed) { |
| // Worklist for items at the current position. |
| std::vector<Item> worklist; |
| // Worklist for items at the next position. |
| std::vector<Item> future_items; |
| CurrentSourcePosition::Scope source_position( |
| SourcePosition{CurrentSourceFile::Get(), {0, 0}, {0, 0}}); |
| std::vector<const Item*> completed_items; |
| std::unordered_map<std::pair<size_t, Symbol*>, std::set<const Item*>, |
| base::hash<std::pair<size_t, Symbol*>>> |
| waiting; |
| |
| std::vector<const Item*> debug_trace; |
| |
| // Start with one top_level symbol mapping to the start symbol of the grammar. |
| // This simplifies things because the start symbol might have several |
| // rules. |
| Symbol top_level; |
| top_level.AddRule(Rule({start})); |
| worklist.push_back(Item{top_level.rule(0), 0, 0, 0}); |
| |
| size_t input_length = tokens.token_symbols.size(); |
| |
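  // Process the item set of each input position in turn: {worklist} holds the
  // items at the current position, while items advanced by the scan phase are
  // parked in {future_items} and become the worklist of the next position.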
| for (size_t pos = 0; pos <= input_length; ++pos) { |
| while (!worklist.empty()) { |
| auto insert_result = processed->insert(worklist.back()); |
| const Item& item = *insert_result.first; |
| DCHECK_EQ(pos, item.pos()); |
| MatchedInput last_token = tokens.token_contents[pos]; |
| CurrentSourcePosition::Get() = last_token.pos; |
| bool is_new = insert_result.second; |
| if (!is_new) item.CheckAmbiguity(worklist.back(), tokens); |
| worklist.pop_back(); |
| if (!is_new) continue; |
| |
| debug_trace.push_back(&item); |
| if (item.IsComplete()) { |
| // 'Complete' phase: Advance all items that were waiting to match this |
| // symbol next. |
| for (const Item* parent : waiting[{item.start(), item.left()}]) { |
| worklist.push_back(parent->Advance(pos, &item)); |
| } |
| } else { |
| Symbol* next = item.NextSymbol(); |
| // 'Scan' phase: Check if {next} is the next symbol in the input (this |
| // is never the case if {next} is a non-terminal). |
| if (pos < tokens.token_symbols.size() && |
| tokens.token_symbols[pos] == next) { |
| future_items.push_back(item.Advance(pos + 1, nullptr)); |
| } |
| // 'Predict' phase: Add items for every rule of the non-terminal. |
| if (!next->IsTerminal()) { |
| // Remember that this item is waiting for completion with {next}. |
| waiting[{pos, next}].insert(&item); |
| } |
| for (size_t i = 0; i < next->rule_number(); ++i) { |
| Rule* rule = next->rule(i); |
| auto already_completed = |
| processed->find(Item{rule, rule->right().size(), pos, pos}); |
| // As discussed in section 3 of |
          // Aycock, John, and R. Nigel Horspool. "Practical Earley
| // parsing." The Computer Journal 45.6 (2002): 620-630. |
| // Earley parsing has the following problem with epsilon rules: |
| // When we complete an item that started at the current position |
| // (that is, it matched zero tokens), we might not yet have |
| // predicted all items it can complete with. Thus we check for the |
| // existence of such items here and complete them immediately. |
| if (already_completed != processed->end()) { |
| worklist.push_back(item.Advance(pos, &*already_completed)); |
| } else { |
| worklist.push_back(Item{rule, 0, pos, pos}); |
| } |
| } |
| } |
| } |
| std::swap(worklist, future_items); |
| } |
| |
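  // The parse succeeded iff the synthetic {top_level} rule was completed over
  // the entire input; its single child is the completed item for {start}.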
| auto final_item = |
| processed->find(Item{top_level.rule(0), 1, 0, input_length}); |
| if (final_item != processed->end()) { |
| // Success: The {top_level} rule matches the complete input. |
| return final_item->Children()[0]; |
| } |
| std::string reason; |
| const Item& last_item = *debug_trace.back(); |
| if (last_item.pos() < tokens.token_symbols.size()) { |
| std::string next_token = tokens.token_contents[last_item.pos()].ToString(); |
| reason = "unexpected token \"" + next_token + "\""; |
| } else { |
| reason = "unexpected end of input"; |
| } |
| ReportError("Parser Error: " + reason); |
| } |
| |
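// The following helpers consume a single character if it satisfies the given
// character class and never consume a NUL character. The int(int) overload
// allows the <cctype> classifiers (e.g. isdigit) to be passed directly; the
// unsigned char cast avoids undefined behavior for negative char values.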
| // static |
| bool Grammar::MatchChar(int (*char_class)(int), InputPosition* pos) { |
| if (**pos && char_class(static_cast<unsigned char>(**pos))) { |
| ++*pos; |
| return true; |
| } |
| return false; |
| } |
| |
| // static |
| bool Grammar::MatchChar(bool (*char_class)(char), InputPosition* pos) { |
| if (**pos && char_class(**pos)) { |
| ++*pos; |
| return true; |
| } |
| return false; |
| } |
| |
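// Matches the literal string {s} at {*pos} and advances {*pos} past it on
// success; {*pos} is left unchanged on failure. The comparison stops at the
// input's terminating NUL, so it cannot read past the end.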
| // static |
| bool Grammar::MatchString(const char* s, InputPosition* pos) { |
| InputPosition current = *pos; |
| for (; *s != 0; ++s, ++current) { |
| if (*s != *current) return false; |
| } |
| *pos = current; |
| return true; |
| } |
| |
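// Consumes one arbitrary character; fails only at the end of the input.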
| // static |
| bool Grammar::MatchAnyChar(InputPosition* pos) { |
| return MatchChar([](char c) { return true; }, pos); |
| } |
| |
| } // namespace torque |
| } // namespace internal |
| } // namespace v8 |