blob: b255dccc05e2b387619ae3f2edbf5614481491b6 [file] [log] [blame]
// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_PARSING_SCANNER_INL_H_
#define V8_PARSING_SCANNER_INL_H_
#include "src/parsing/keywords-gen.h"
#include "src/parsing/scanner.h"
#include "src/strings/char-predicates-inl.h"
#include "src/utils/utils.h"
namespace v8 {
namespace internal {
// ----------------------------------------------------------------------------
// Keyword Matcher
#define KEYWORDS(KEYWORD_GROUP, KEYWORD) \
KEYWORD_GROUP('a') \
KEYWORD("async", Token::ASYNC) \
KEYWORD("await", Token::AWAIT) \
KEYWORD_GROUP('b') \
KEYWORD("break", Token::BREAK) \
KEYWORD_GROUP('c') \
KEYWORD("case", Token::CASE) \
KEYWORD("catch", Token::CATCH) \
KEYWORD("class", Token::CLASS) \
KEYWORD("const", Token::CONST) \
KEYWORD("continue", Token::CONTINUE) \
KEYWORD_GROUP('d') \
KEYWORD("debugger", Token::DEBUGGER) \
KEYWORD("default", Token::DEFAULT) \
KEYWORD("delete", Token::DELETE) \
KEYWORD("do", Token::DO) \
KEYWORD_GROUP('e') \
KEYWORD("else", Token::ELSE) \
KEYWORD("enum", Token::ENUM) \
KEYWORD("export", Token::EXPORT) \
KEYWORD("extends", Token::EXTENDS) \
KEYWORD_GROUP('f') \
KEYWORD("false", Token::FALSE_LITERAL) \
KEYWORD("finally", Token::FINALLY) \
KEYWORD("for", Token::FOR) \
KEYWORD("function", Token::FUNCTION) \
KEYWORD_GROUP('g') \
KEYWORD("get", Token::GET) \
KEYWORD_GROUP('i') \
KEYWORD("if", Token::IF) \
KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
KEYWORD("import", Token::IMPORT) \
KEYWORD("in", Token::IN) \
KEYWORD("instanceof", Token::INSTANCEOF) \
KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \
KEYWORD_GROUP('l') \
KEYWORD("let", Token::LET) \
KEYWORD_GROUP('n') \
KEYWORD("new", Token::NEW) \
KEYWORD("null", Token::NULL_LITERAL) \
KEYWORD_GROUP('p') \
KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \
KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \
KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \
KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \
KEYWORD_GROUP('r') \
KEYWORD("return", Token::RETURN) \
KEYWORD_GROUP('s') \
KEYWORD("set", Token::SET) \
KEYWORD("static", Token::STATIC) \
KEYWORD("super", Token::SUPER) \
KEYWORD("switch", Token::SWITCH) \
KEYWORD_GROUP('t') \
KEYWORD("this", Token::THIS) \
KEYWORD("throw", Token::THROW) \
KEYWORD("true", Token::TRUE_LITERAL) \
KEYWORD("try", Token::TRY) \
KEYWORD("typeof", Token::TYPEOF) \
KEYWORD_GROUP('v') \
KEYWORD("var", Token::VAR) \
KEYWORD("void", Token::VOID) \
KEYWORD_GROUP('w') \
KEYWORD("while", Token::WHILE) \
KEYWORD("with", Token::WITH) \
KEYWORD_GROUP('y') \
KEYWORD("yield", Token::YIELD)
constexpr bool IsKeywordStart(char c) {
#define KEYWORD_GROUP_CHECK(ch) c == ch ||
#define KEYWORD_CHECK(keyword, token)
return KEYWORDS(KEYWORD_GROUP_CHECK, KEYWORD_CHECK) /* || */ false;
#undef KEYWORD_GROUP_CHECK
#undef KEYWORD_CHECK
}
V8_INLINE Token::Value KeywordOrIdentifierToken(const uint8_t* input,
int input_length) {
DCHECK_GE(input_length, 1);
return PerfectKeywordHash::GetToken(reinterpret_cast<const char*>(input),
input_length);
}
// Recursive constexpr template magic to check if a character is in a given
// string.
template <int N>
constexpr bool IsInString(const char (&s)[N], char c, size_t i = 0) {
return i >= N ? false : s[i] == c ? true : IsInString(s, c, i + 1);
}
inline constexpr bool CanBeKeywordCharacter(char c) {
return IsInString(
#define KEYWORD_GROUP_CASE(ch) // Nothing
#define KEYWORD(keyword, token) keyword
// Use C string literal concatenation ("a" "b" becomes "ab") to build one
// giant string containing all the keywords.
KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
#undef KEYWORD
#undef KEYWORD_GROUP_CASE
,
c);
}
// Make sure tokens are stored as a single byte.
STATIC_ASSERT(sizeof(Token::Value) == 1);
// Get the shortest token that this character starts, the token may change
// depending on subsequent characters.
constexpr Token::Value GetOneCharToken(char c) {
// clang-format off
return
c == '(' ? Token::LPAREN :
c == ')' ? Token::RPAREN :
c == '{' ? Token::LBRACE :
c == '}' ? Token::RBRACE :
c == '[' ? Token::LBRACK :
c == ']' ? Token::RBRACK :
c == '?' ? Token::CONDITIONAL :
c == ':' ? Token::COLON :
c == ';' ? Token::SEMICOLON :
c == ',' ? Token::COMMA :
c == '.' ? Token::PERIOD :
c == '|' ? Token::BIT_OR :
c == '&' ? Token::BIT_AND :
c == '^' ? Token::BIT_XOR :
c == '~' ? Token::BIT_NOT :
c == '!' ? Token::NOT :
c == '<' ? Token::LT :
c == '>' ? Token::GT :
c == '%' ? Token::MOD :
c == '=' ? Token::ASSIGN :
c == '+' ? Token::ADD :
c == '-' ? Token::SUB :
c == '*' ? Token::MUL :
c == '/' ? Token::DIV :
c == '#' ? Token::PRIVATE_NAME :
c == '"' ? Token::STRING :
c == '\'' ? Token::STRING :
c == '`' ? Token::TEMPLATE_SPAN :
c == '\\' ? Token::IDENTIFIER :
// Whitespace or line terminator
c == ' ' ? Token::WHITESPACE :
c == '\t' ? Token::WHITESPACE :
c == '\v' ? Token::WHITESPACE :
c == '\f' ? Token::WHITESPACE :
c == '\r' ? Token::WHITESPACE :
c == '\n' ? Token::WHITESPACE :
// IsDecimalDigit must be tested before IsAsciiIdentifier
IsDecimalDigit(c) ? Token::NUMBER :
IsAsciiIdentifier(c) ? Token::IDENTIFIER :
Token::ILLEGAL;
// clang-format on
}
// Table of one-character tokens, by character (0x00..0x7F only).
static const constexpr Token::Value one_char_tokens[128] = {
#define CALL_GET_SCAN_FLAGS(N) GetOneCharToken(N),
INT_0_TO_127_LIST(CALL_GET_SCAN_FLAGS)
#undef CALL_GET_SCAN_FLAGS
};
#undef KEYWORDS
V8_INLINE Token::Value Scanner::ScanIdentifierOrKeyword() {
next().literal_chars.Start();
return ScanIdentifierOrKeywordInner();
}
// Character flags for the fast path of scanning a keyword or identifier token.
enum class ScanFlags : uint8_t {
kTerminatesLiteral = 1 << 0,
// "Cannot" rather than "can" so that this flag can be ORed together across
// multiple characters.
kCannotBeKeyword = 1 << 1,
kCannotBeKeywordStart = 1 << 2,
kStringTerminator = 1 << 3,
kIdentifierNeedsSlowPath = 1 << 4,
kMultilineCommentCharacterNeedsSlowPath = 1 << 5,
};
constexpr uint8_t GetScanFlags(char c) {
return
// Keywords are all lowercase and only contain letters.
// Note that non-identifier characters do not set this flag, so
// that it plays well with kTerminatesLiteral.
(IsAsciiIdentifier(c) && !CanBeKeywordCharacter(c)
? static_cast<uint8_t>(ScanFlags::kCannotBeKeyword)
: 0) |
(IsKeywordStart(c)
? 0
: static_cast<uint8_t>(ScanFlags::kCannotBeKeywordStart)) |
// Anything that isn't an identifier character will terminate the
// literal, or at least terminates the literal fast path processing
// (like an escape).
(!IsAsciiIdentifier(c)
? static_cast<uint8_t>(ScanFlags::kTerminatesLiteral)
: 0) |
// Possible string termination characters.
((c == '\'' || c == '"' || c == '\n' || c == '\r' || c == '\\')
? static_cast<uint8_t>(ScanFlags::kStringTerminator)
: 0) |
// Escapes are processed on the slow path.
(c == '\\' ? static_cast<uint8_t>(ScanFlags::kIdentifierNeedsSlowPath)
: 0) |
// Newlines and * are interesting characters for multiline comment
// scanning.
(c == '\n' || c == '\r' || c == '*'
? static_cast<uint8_t>(
ScanFlags::kMultilineCommentCharacterNeedsSlowPath)
: 0);
}
inline bool TerminatesLiteral(uint8_t scan_flags) {
return (scan_flags & static_cast<uint8_t>(ScanFlags::kTerminatesLiteral));
}
inline bool CanBeKeyword(uint8_t scan_flags) {
return !(scan_flags & static_cast<uint8_t>(ScanFlags::kCannotBeKeyword));
}
inline bool IdentifierNeedsSlowPath(uint8_t scan_flags) {
return (scan_flags &
static_cast<uint8_t>(ScanFlags::kIdentifierNeedsSlowPath));
}
inline bool MultilineCommentCharacterNeedsSlowPath(uint8_t scan_flags) {
return (scan_flags & static_cast<uint8_t>(
ScanFlags::kMultilineCommentCharacterNeedsSlowPath));
}
inline bool MayTerminateString(uint8_t scan_flags) {
return (scan_flags & static_cast<uint8_t>(ScanFlags::kStringTerminator));
}
// Table of precomputed scan flags for the 128 ASCII characters, for branchless
// flag calculation during the scan.
static constexpr const uint8_t character_scan_flags[128] = {
#define CALL_GET_SCAN_FLAGS(N) GetScanFlags(N),
INT_0_TO_127_LIST(CALL_GET_SCAN_FLAGS)
#undef CALL_GET_SCAN_FLAGS
};
inline bool CharCanBeKeyword(uc32 c) {
return static_cast<uint32_t>(c) < arraysize(character_scan_flags) &&
CanBeKeyword(character_scan_flags[c]);
}
V8_INLINE Token::Value Scanner::ScanIdentifierOrKeywordInner() {
DCHECK(IsIdentifierStart(c0_));
bool escaped = false;
bool can_be_keyword = true;
STATIC_ASSERT(arraysize(character_scan_flags) == kMaxAscii + 1);
if (V8_LIKELY(static_cast<uint32_t>(c0_) <= kMaxAscii)) {
if (V8_LIKELY(c0_ != '\\')) {
uint8_t scan_flags = character_scan_flags[c0_];
DCHECK(!TerminatesLiteral(scan_flags));
STATIC_ASSERT(static_cast<uint8_t>(ScanFlags::kCannotBeKeywordStart) ==
static_cast<uint8_t>(ScanFlags::kCannotBeKeyword) << 1);
scan_flags >>= 1;
// Make sure the shifting above doesn't set IdentifierNeedsSlowPath.
// Otherwise we'll fall into the slow path after scanning the identifier.
DCHECK(!IdentifierNeedsSlowPath(scan_flags));
AddLiteralChar(static_cast<char>(c0_));
AdvanceUntil([this, &scan_flags](uc32 c0) {
if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) {
// A non-ascii character means we need to drop through to the slow
// path.
// TODO(leszeks): This would be most efficient as a goto to the slow
// path, check codegen and maybe use a bool instead.
scan_flags |=
static_cast<uint8_t>(ScanFlags::kIdentifierNeedsSlowPath);
return true;
}
uint8_t char_flags = character_scan_flags[c0];
scan_flags |= char_flags;
if (TerminatesLiteral(char_flags)) {
return true;
} else {
AddLiteralChar(static_cast<char>(c0));
return false;
}
});
if (V8_LIKELY(!IdentifierNeedsSlowPath(scan_flags))) {
if (!CanBeKeyword(scan_flags)) return Token::IDENTIFIER;
// Could be a keyword or identifier.
Vector<const uint8_t> chars = next().literal_chars.one_byte_literal();
return KeywordOrIdentifierToken(chars.begin(), chars.length());
}
can_be_keyword = CanBeKeyword(scan_flags);
} else {
// Special case for escapes at the start of an identifier.
escaped = true;
uc32 c = ScanIdentifierUnicodeEscape();
DCHECK(!IsIdentifierStart(Invalid()));
if (c == '\\' || !IsIdentifierStart(c)) {
return Token::ILLEGAL;
}
AddLiteralChar(c);
can_be_keyword = CharCanBeKeyword(c);
}
}
return ScanIdentifierOrKeywordInnerSlow(escaped, can_be_keyword);
}
V8_INLINE Token::Value Scanner::SkipWhiteSpace() {
int start_position = source_pos();
// We won't skip behind the end of input.
DCHECK(!IsWhiteSpaceOrLineTerminator(kEndOfInput));
// Advance as long as character is a WhiteSpace or LineTerminator.
while (IsWhiteSpaceOrLineTerminator(c0_)) {
if (!next().after_line_terminator && unibrow::IsLineTerminator(c0_)) {
next().after_line_terminator = true;
}
Advance();
}
// Return whether or not we skipped any characters.
if (source_pos() == start_position) {
DCHECK_NE('0', c0_);
return Token::ILLEGAL;
}
return Token::WHITESPACE;
}
V8_INLINE Token::Value Scanner::ScanSingleToken() {
Token::Value token;
do {
next().location.beg_pos = source_pos();
if (V8_LIKELY(static_cast<unsigned>(c0_) <= kMaxAscii)) {
token = one_char_tokens[c0_];
switch (token) {
case Token::LPAREN:
case Token::RPAREN:
case Token::LBRACE:
case Token::RBRACE:
case Token::LBRACK:
case Token::RBRACK:
case Token::COLON:
case Token::SEMICOLON:
case Token::COMMA:
case Token::BIT_NOT:
case Token::ILLEGAL:
// One character tokens.
return Select(token);
case Token::CONDITIONAL:
// ? ?. ?? ??=
Advance();
if (c0_ == '.') {
Advance();
if (!IsDecimalDigit(c0_)) return Token::QUESTION_PERIOD;
PushBack('.');
} else if (c0_ == '?') {
return Select('=', Token::ASSIGN_NULLISH, Token::NULLISH);
}
return Token::CONDITIONAL;
case Token::STRING:
return ScanString();
case Token::LT:
// < <= << <<= <!--
Advance();
if (c0_ == '=') return Select(Token::LTE);
if (c0_ == '<') return Select('=', Token::ASSIGN_SHL, Token::SHL);
if (c0_ == '!') {
token = ScanHtmlComment();
continue;
}
return Token::LT;
case Token::GT:
// > >= >> >>= >>> >>>=
Advance();
if (c0_ == '=') return Select(Token::GTE);
if (c0_ == '>') {
// >> >>= >>> >>>=
Advance();
if (c0_ == '=') return Select(Token::ASSIGN_SAR);
if (c0_ == '>') return Select('=', Token::ASSIGN_SHR, Token::SHR);
return Token::SAR;
}
return Token::GT;
case Token::ASSIGN:
// = == === =>
Advance();
if (c0_ == '=') return Select('=', Token::EQ_STRICT, Token::EQ);
if (c0_ == '>') return Select(Token::ARROW);
return Token::ASSIGN;
case Token::NOT:
// ! != !==
Advance();
if (c0_ == '=') return Select('=', Token::NE_STRICT, Token::NE);
return Token::NOT;
case Token::ADD:
// + ++ +=
Advance();
if (c0_ == '+') return Select(Token::INC);
if (c0_ == '=') return Select(Token::ASSIGN_ADD);
return Token::ADD;
case Token::SUB:
// - -- --> -=
Advance();
if (c0_ == '-') {
Advance();
if (c0_ == '>' && next().after_line_terminator) {
// For compatibility with SpiderMonkey, we skip lines that
// start with an HTML comment end '-->'.
token = SkipSingleHTMLComment();
continue;
}
return Token::DEC;
}
if (c0_ == '=') return Select(Token::ASSIGN_SUB);
return Token::SUB;
case Token::MUL:
// * *=
Advance();
if (c0_ == '*') return Select('=', Token::ASSIGN_EXP, Token::EXP);
if (c0_ == '=') return Select(Token::ASSIGN_MUL);
return Token::MUL;
case Token::MOD:
// % %=
return Select('=', Token::ASSIGN_MOD, Token::MOD);
case Token::DIV:
// / // /* /=
Advance();
if (c0_ == '/') {
uc32 c = Peek();
if (c == '#' || c == '@') {
Advance();
Advance();
token = SkipSourceURLComment();
continue;
}
token = SkipSingleLineComment();
continue;
}
if (c0_ == '*') {
token = SkipMultiLineComment();
continue;
}
if (c0_ == '=') return Select(Token::ASSIGN_DIV);
return Token::DIV;
case Token::BIT_AND:
// & && &= &&=
Advance();
if (c0_ == '&') return Select('=', Token::ASSIGN_AND, Token::AND);
if (c0_ == '=') return Select(Token::ASSIGN_BIT_AND);
return Token::BIT_AND;
case Token::BIT_OR:
// | || |= ||=
Advance();
if (c0_ == '|') return Select('=', Token::ASSIGN_OR, Token::OR);
if (c0_ == '=') return Select(Token::ASSIGN_BIT_OR);
return Token::BIT_OR;
case Token::BIT_XOR:
// ^ ^=
return Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
case Token::PERIOD:
// . Number
Advance();
if (IsDecimalDigit(c0_)) return ScanNumber(true);
if (c0_ == '.') {
if (Peek() == '.') {
Advance();
Advance();
return Token::ELLIPSIS;
}
}
return Token::PERIOD;
case Token::TEMPLATE_SPAN:
Advance();
return ScanTemplateSpan();
case Token::PRIVATE_NAME:
if (source_pos() == 0 && Peek() == '!') {
token = SkipSingleLineComment();
continue;
}
return ScanPrivateName();
case Token::WHITESPACE:
token = SkipWhiteSpace();
continue;
case Token::NUMBER:
return ScanNumber(false);
case Token::IDENTIFIER:
return ScanIdentifierOrKeyword();
default:
UNREACHABLE();
}
}
if (IsIdentifierStart(c0_) ||
(CombineSurrogatePair() && IsIdentifierStart(c0_))) {
return ScanIdentifierOrKeyword();
}
if (c0_ == kEndOfInput) {
return source_->has_parser_error() ? Token::ILLEGAL : Token::EOS;
}
token = SkipWhiteSpace();
// Continue scanning for tokens as long as we're just skipping whitespace.
} while (token == Token::WHITESPACE);
return token;
}
void Scanner::Scan(TokenDesc* next_desc) {
DCHECK_EQ(next_desc, &next());
next_desc->token = ScanSingleToken();
DCHECK_IMPLIES(has_parser_error(), next_desc->token == Token::ILLEGAL);
next_desc->location.end_pos = source_pos();
#ifdef DEBUG
SanityCheckTokenDesc(current());
SanityCheckTokenDesc(next());
SanityCheckTokenDesc(next_next());
#endif
}
void Scanner::Scan() { Scan(next_); }
} // namespace internal
} // namespace v8
#endif // V8_PARSING_SCANNER_INL_H_