| import { assert } from './assert'; |
| import { Character } from './character'; |
| import { ErrorHandler } from './error-handler'; |
| import { Messages } from './messages'; |
| import { Token } from './token'; |
| |
| function hexValue(ch: string): number { |
| return '0123456789abcdef'.indexOf(ch.toLowerCase()); |
| } |
| |
| function octalValue(ch: string): number { |
| return '01234567'.indexOf(ch); |
| } |
| |
| export interface Position { |
| line: number; |
| column: number; |
| } |
| |
| export interface SourceLocation { |
| start: Position; |
| end: Position; |
| source?: string; |
| } |
| |
| export interface Comment { |
| multiLine: boolean; |
| slice: number[]; |
| range: [number, number]; |
| loc: SourceLocation; |
| } |
| |
| export interface RawToken { |
| type: Token; |
| value: string | number; |
| pattern?: string; |
| flags?: string; |
| regex?: RegExp | null; |
| octal?: boolean; |
| cooked?: string; |
| head?: boolean; |
| tail?: boolean; |
| lineNumber: number; |
| lineStart: number; |
| start: number; |
| end: number; |
| } |
| |
| interface ScannerState { |
| index: number; |
| lineNumber: number; |
| lineStart: number; |
| curlyStack: string[]; |
| } |
| |
| export class Scanner { |
| |
| readonly source: string; |
| readonly errorHandler: ErrorHandler; |
| trackComment: boolean; |
| isModule: boolean; |
| |
| index: number; |
| lineNumber: number; |
| lineStart: number; |
| curlyStack: string[]; |
| |
| private readonly length: number; |
| |
| constructor(code: string, handler: ErrorHandler) { |
| this.source = code; |
| this.errorHandler = handler; |
| this.trackComment = false; |
| this.isModule = false; |
| |
| this.length = code.length; |
| this.index = 0; |
| this.lineNumber = (code.length > 0) ? 1 : 0; |
| this.lineStart = 0; |
| this.curlyStack = []; |
| } |
| |
| public saveState(): ScannerState { |
| return { |
| index: this.index, |
| lineNumber: this.lineNumber, |
| lineStart: this.lineStart, |
| curlyStack: this.curlyStack.slice() |
| }; |
| } |
| |
| public restoreState(state: ScannerState): void { |
| this.index = state.index; |
| this.lineNumber = state.lineNumber; |
| this.lineStart = state.lineStart; |
| this.curlyStack = state.curlyStack; |
| } |
| |
| public eof(): boolean { |
| return this.index >= this.length; |
| } |
| |
| public throwUnexpectedToken(message = Messages.UnexpectedTokenIllegal): never { |
| return this.errorHandler.throwError(this.index, this.lineNumber, |
| this.index - this.lineStart + 1, message); |
| } |
| |
| private tolerateUnexpectedToken(message = Messages.UnexpectedTokenIllegal) { |
| this.errorHandler.tolerateError(this.index, this.lineNumber, |
| this.index - this.lineStart + 1, message); |
| } |
| |
| // https://tc39.github.io/ecma262/#sec-comments |
| |
| private skipSingleLineComment(offset: number): Comment[] { |
| let comments: Comment[] = []; |
| let start, loc; |
| |
| if (this.trackComment) { |
| comments = []; |
| start = this.index - offset; |
| loc = { |
| start: { |
| line: this.lineNumber, |
| column: this.index - this.lineStart - offset |
| }, |
| end: {} |
| }; |
| } |
| |
| while (!this.eof()) { |
| const ch = this.source.charCodeAt(this.index); |
| ++this.index; |
| if (Character.isLineTerminator(ch)) { |
| if (this.trackComment) { |
| loc.end = { |
| line: this.lineNumber, |
| column: this.index - this.lineStart - 1 |
| }; |
| const entry: Comment = { |
| multiLine: false, |
| slice: [start + offset, this.index - 1], |
| range: [start, this.index - 1], |
| loc: loc |
| }; |
| comments.push(entry); |
| } |
| if (ch === 13 && this.source.charCodeAt(this.index) === 10) { |
| ++this.index; |
| } |
| ++this.lineNumber; |
| this.lineStart = this.index; |
| return comments; |
| } |
| } |
| |
| if (this.trackComment) { |
| loc.end = { |
| line: this.lineNumber, |
| column: this.index - this.lineStart |
| }; |
| const entry: Comment = { |
| multiLine: false, |
| slice: [start + offset, this.index], |
| range: [start, this.index], |
| loc: loc |
| }; |
| comments.push(entry); |
| } |
| |
| return comments; |
| } |
| |
| private skipMultiLineComment(): Comment[] { |
| let comments: Comment[] = []; |
| let start, loc; |
| |
| if (this.trackComment) { |
| comments = []; |
| start = this.index - 2; |
| loc = { |
| start: { |
| line: this.lineNumber, |
| column: this.index - this.lineStart - 2 |
| }, |
| end: {} |
| }; |
| } |
| |
| while (!this.eof()) { |
| const ch = this.source.charCodeAt(this.index); |
| if (Character.isLineTerminator(ch)) { |
| if (ch === 0x0D && this.source.charCodeAt(this.index + 1) === 0x0A) { |
| ++this.index; |
| } |
| ++this.lineNumber; |
| ++this.index; |
| this.lineStart = this.index; |
| } else if (ch === 0x2A) { |
| // Block comment ends with '*/'. |
| if (this.source.charCodeAt(this.index + 1) === 0x2F) { |
| this.index += 2; |
| if (this.trackComment) { |
| loc.end = { |
| line: this.lineNumber, |
| column: this.index - this.lineStart |
| }; |
| const entry: Comment = { |
| multiLine: true, |
| slice: [start + 2, this.index - 2], |
| range: [start, this.index], |
| loc: loc |
| }; |
| comments.push(entry); |
| } |
| return comments; |
| } |
| ++this.index; |
| } else { |
| ++this.index; |
| } |
| } |
| |
| // Ran off the end of the file - the whole thing is a comment |
| if (this.trackComment) { |
| loc.end = { |
| line: this.lineNumber, |
| column: this.index - this.lineStart |
| }; |
| const entry: Comment = { |
| multiLine: true, |
| slice: [start + 2, this.index], |
| range: [start, this.index], |
| loc: loc |
| }; |
| comments.push(entry); |
| } |
| |
| this.tolerateUnexpectedToken(); |
| return comments; |
| } |
| |
| public scanComments() { |
| let comments; |
| if (this.trackComment) { |
| comments = []; |
| } |
| |
| let start = (this.index === 0); |
| while (!this.eof()) { |
| let ch = this.source.charCodeAt(this.index); |
| |
| if (Character.isWhiteSpace(ch)) { |
| ++this.index; |
| } else if (Character.isLineTerminator(ch)) { |
| ++this.index; |
| if (ch === 0x0D && this.source.charCodeAt(this.index) === 0x0A) { |
| ++this.index; |
| } |
| ++this.lineNumber; |
| this.lineStart = this.index; |
| start = true; |
| } else if (ch === 0x2F) { // U+002F is '/' |
| ch = this.source.charCodeAt(this.index + 1); |
| if (ch === 0x2F) { |
| this.index += 2; |
| const comment = this.skipSingleLineComment(2); |
| if (this.trackComment) { |
| comments = comments.concat(comment); |
| } |
| start = true; |
| } else if (ch === 0x2A) { // U+002A is '*' |
| this.index += 2; |
| const comment = this.skipMultiLineComment(); |
| if (this.trackComment) { |
| comments = comments.concat(comment); |
| } |
| } else { |
| break; |
| } |
| } else if (start && ch === 0x2D) { // U+002D is '-' |
| // U+003E is '>' |
| if ((this.source.charCodeAt(this.index + 1) === 0x2D) && (this.source.charCodeAt(this.index + 2) === 0x3E)) { |
| // '-->' is a single-line comment |
| this.index += 3; |
| const comment = this.skipSingleLineComment(3); |
| if (this.trackComment) { |
| comments = comments.concat(comment); |
| } |
| } else { |
| break; |
| } |
| } else if (ch === 0x3C && !this.isModule) { // U+003C is '<' |
| if (this.source.slice(this.index + 1, this.index + 4) === '!--') { |
| this.index += 4; // `<!--` |
| const comment = this.skipSingleLineComment(4); |
| if (this.trackComment) { |
| comments = comments.concat(comment); |
| } |
| } else { |
| break; |
| } |
| } else { |
| break; |
| } |
| } |
| |
| return comments; |
| } |
| |
| // https://tc39.github.io/ecma262/#sec-future-reserved-words |
| |
| public isFutureReservedWord(id: string): boolean { |
| switch (id) { |
| case 'enum': |
| case 'export': |
| case 'import': |
| case 'super': |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| public isStrictModeReservedWord(id: string): boolean { |
| switch (id) { |
| case 'implements': |
| case 'interface': |
| case 'package': |
| case 'private': |
| case 'protected': |
| case 'public': |
| case 'static': |
| case 'yield': |
| case 'let': |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| public isRestrictedWord(id: string): boolean { |
| return id === 'eval' || id === 'arguments'; |
| } |
| |
| // https://tc39.github.io/ecma262/#sec-keywords |
| |
| private isKeyword(id: string): boolean { |
| switch (id.length) { |
| case 2: |
| return (id === 'if') || (id === 'in') || (id === 'do'); |
| case 3: |
| return (id === 'var') || (id === 'for') || (id === 'new') || |
| (id === 'try') || (id === 'let'); |
| case 4: |
| return (id === 'this') || (id === 'else') || (id === 'case') || |
| (id === 'void') || (id === 'with') || (id === 'enum'); |
| case 5: |
| return (id === 'while') || (id === 'break') || (id === 'catch') || |
| (id === 'throw') || (id === 'const') || (id === 'yield') || |
| (id === 'class') || (id === 'super'); |
| case 6: |
| return (id === 'return') || (id === 'typeof') || (id === 'delete') || |
| (id === 'switch') || (id === 'export') || (id === 'import'); |
| case 7: |
| return (id === 'default') || (id === 'finally') || (id === 'extends'); |
| case 8: |
| return (id === 'function') || (id === 'continue') || (id === 'debugger'); |
| case 10: |
| return (id === 'instanceof'); |
| default: |
| return false; |
| } |
| } |
| |
| private codePointAt(i: number): number { |
| let cp = this.source.charCodeAt(i); |
| |
| if (cp >= 0xD800 && cp <= 0xDBFF) { |
| const second = this.source.charCodeAt(i + 1); |
| if (second >= 0xDC00 && second <= 0xDFFF) { |
| const first = cp; |
| cp = (first - 0xD800) * 0x400 + second - 0xDC00 + 0x10000; |
| } |
| } |
| |
| return cp; |
| } |
| |
| private scanHexEscape(prefix: string): string | null { |
| const len = (prefix === 'u') ? 4 : 2; |
| let code = 0; |
| |
| for (let i = 0; i < len; ++i) { |
| if (!this.eof() && Character.isHexDigit(this.source.charCodeAt(this.index))) { |
| code = code * 16 + hexValue(this.source[this.index++]); |
| } else { |
| return null; |
| } |
| } |
| return String.fromCharCode(code); |
| } |
| |
| private scanUnicodeCodePointEscape(): string { |
| let ch = this.source[this.index]; |
| let code = 0; |
| |
| // At least, one hex digit is required. |
| if (ch === '}') { |
| this.throwUnexpectedToken(); |
| } |
| |
| while (!this.eof()) { |
| ch = this.source[this.index++]; |
| if (!Character.isHexDigit(ch.charCodeAt(0))) { |
| break; |
| } |
| code = code * 16 + hexValue(ch); |
| } |
| |
| if (code > 0x10FFFF || ch !== '}') { |
| this.throwUnexpectedToken(); |
| } |
| |
| return Character.fromCodePoint(code); |
| } |
| |
| private getIdentifier(): string { |
| const start = this.index++; |
| while (!this.eof()) { |
| const ch = this.source.charCodeAt(this.index); |
| if (ch === 0x5C) { |
| // Blackslash (U+005C) marks Unicode escape sequence. |
| this.index = start; |
| return this.getComplexIdentifier(); |
| } else if (ch >= 0xD800 && ch < 0xDFFF) { |
| // Need to handle surrogate pairs. |
| this.index = start; |
| return this.getComplexIdentifier(); |
| } |
| if (Character.isIdentifierPart(ch)) { |
| ++this.index; |
| } else { |
| break; |
| } |
| } |
| |
| return this.source.slice(start, this.index); |
| } |
| |
| private getComplexIdentifier(): string { |
| let cp = this.codePointAt(this.index); |
| let id = Character.fromCodePoint(cp); |
| this.index += id.length; |
| |
| // '\u' (U+005C, U+0075) denotes an escaped character. |
| let ch; |
| if (cp === 0x5C) { |
| if (this.source.charCodeAt(this.index) !== 0x75) { |
| this.throwUnexpectedToken(); |
| } |
| ++this.index; |
| if (this.source[this.index] === '{') { |
| ++this.index; |
| ch = this.scanUnicodeCodePointEscape(); |
| } else { |
| ch = this.scanHexEscape('u'); |
| if (ch === null || ch === '\\' || !Character.isIdentifierStart(ch.charCodeAt(0))) { |
| this.throwUnexpectedToken(); |
| } |
| } |
| id = ch; |
| } |
| |
| while (!this.eof()) { |
| cp = this.codePointAt(this.index); |
| if (!Character.isIdentifierPart(cp)) { |
| break; |
| } |
| ch = Character.fromCodePoint(cp); |
| id += ch; |
| this.index += ch.length; |
| |
| // '\u' (U+005C, U+0075) denotes an escaped character. |
| if (cp === 0x5C) { |
| id = id.substr(0, id.length - 1); |
| if (this.source.charCodeAt(this.index) !== 0x75) { |
| this.throwUnexpectedToken(); |
| } |
| ++this.index; |
| if (this.source[this.index] === '{') { |
| ++this.index; |
| ch = this.scanUnicodeCodePointEscape(); |
| } else { |
| ch = this.scanHexEscape('u'); |
| if (ch === null || ch === '\\' || !Character.isIdentifierPart(ch.charCodeAt(0))) { |
| this.throwUnexpectedToken(); |
| } |
| } |
| id += ch; |
| } |
| } |
| |
| return id; |
| } |
| |
| private octalToDecimal(ch: string) { |
| // \0 is not octal escape sequence |
| let octal = (ch !== '0'); |
| let code = octalValue(ch); |
| |
| if (!this.eof() && Character.isOctalDigit(this.source.charCodeAt(this.index))) { |
| octal = true; |
| code = code * 8 + octalValue(this.source[this.index++]); |
| |
| // 3 digits are only allowed when string starts |
| // with 0, 1, 2, 3 |
| if ('0123'.indexOf(ch) >= 0 && !this.eof() && Character.isOctalDigit(this.source.charCodeAt(this.index))) { |
| code = code * 8 + octalValue(this.source[this.index++]); |
| } |
| } |
| |
| return { |
| code: code, |
| octal: octal |
| }; |
| } |
| |
| // https://tc39.github.io/ecma262/#sec-names-and-keywords |
| |
| private scanIdentifier(): RawToken { |
| let type: Token; |
| const start = this.index; |
| |
| // Backslash (U+005C) starts an escaped character. |
| const id = (this.source.charCodeAt(start) === 0x5C) ? this.getComplexIdentifier() : this.getIdentifier(); |
| |
| // There is no keyword or literal with only one character. |
| // Thus, it must be an identifier. |
| if (id.length === 1) { |
| type = Token.Identifier; |
| } else if (this.isKeyword(id)) { |
| type = Token.Keyword; |
| } else if (id === 'null') { |
| type = Token.NullLiteral; |
| } else if (id === 'true' || id === 'false') { |
| type = Token.BooleanLiteral; |
| } else { |
| type = Token.Identifier; |
| } |
| |
| if (type !== Token.Identifier && (start + id.length !== this.index)) { |
| const restore = this.index; |
| this.index = start; |
| this.tolerateUnexpectedToken(Messages.InvalidEscapedReservedWord); |
| this.index = restore; |
| } |
| |
| return { |
| type: type, |
| value: id, |
| lineNumber: this.lineNumber, |
| lineStart: this.lineStart, |
| start: start, |
| end: this.index |
| }; |
| } |
| |
| // https://tc39.github.io/ecma262/#sec-punctuators |
| |
| private scanPunctuator(): RawToken { |
| const start = this.index; |
| |
| // Check for most common single-character punctuators. |
| let str = this.source[this.index]; |
| switch (str) { |
| |
| case '(': |
| case '{': |
| if (str === '{') { |
| this.curlyStack.push('{'); |
| } |
| ++this.index; |
| break; |
| |
| case '.': |
| ++this.index; |
| if (this.source[this.index] === '.' && this.source[this.index + 1] === '.') { |
| // Spread operator: ... |
| this.index += 2; |
| str = '...'; |
| } |
| break; |
| |
| case '}': |
| ++this.index; |
| this.curlyStack.pop(); |
| break; |
| case ')': |
| case ';': |
| case ',': |
| case '[': |
| case ']': |
| case ':': |
| case '?': |
| case '~': |
| ++this.index; |
| break; |
| |
| default: |
| // 4-character punctuator. |
| str = this.source.substr(this.index, 4); |
| if (str === '>>>=') { |
| this.index += 4; |
| } else { |
| |
| // 3-character punctuators. |
| str = str.substr(0, 3); |
| if (str === '===' || str === '!==' || str === '>>>' || |
| str === '<<=' || str === '>>=' || str === '**=') { |
| this.index += 3; |
| } else { |
| |
| // 2-character punctuators. |
| str = str.substr(0, 2); |
| if (str === '&&' || str === '||' || str === '==' || str === '!=' || |
| str === '+=' || str === '-=' || str === '*=' || str === '/=' || |
| str === '++' || str === '--' || str === '<<' || str === '>>' || |
| str === '&=' || str === '|=' || str === '^=' || str === '%=' || |
| str === '<=' || str === '>=' || str === '=>' || str === '**') { |
| this.index += 2; |
| } else { |
| |
| // 1-character punctuators. |
| str = this.source[this.index]; |
| if ('<>=!+-*%&|^/'.indexOf(str) >= 0) { |
| ++this.index; |
| } |
| } |
| } |
| } |
| } |
| |
| if (this.index === start) { |
| this.throwUnexpectedToken(); |
| } |
| |
| return { |
| type: Token.Punctuator, |
| value: str, |
| lineNumber: this.lineNumber, |
| lineStart: this.lineStart, |
| start: start, |
| end: this.index |
| }; |
| } |
| |
| // https://tc39.github.io/ecma262/#sec-literals-numeric-literals |
| |
| private scanHexLiteral(start: number): RawToken { |
| let num = ''; |
| |
| while (!this.eof()) { |
| if (!Character.isHexDigit(this.source.charCodeAt(this.index))) { |
| break; |
| } |
| num += this.source[this.index++]; |
| } |
| |
| if (num.length === 0) { |
| this.throwUnexpectedToken(); |
| } |
| |
| if (Character.isIdentifierStart(this.source.charCodeAt(this.index))) { |
| this.throwUnexpectedToken(); |
| } |
| |
| return { |
| type: Token.NumericLiteral, |
| value: parseInt('0x' + num, 16), |
| lineNumber: this.lineNumber, |
| lineStart: this.lineStart, |
| start: start, |
| end: this.index |
| }; |
| } |
| |
| private scanBinaryLiteral(start: number): RawToken { |
| let num = ''; |
| let ch; |
| |
| while (!this.eof()) { |
| ch = this.source[this.index]; |
| if (ch !== '0' && ch !== '1') { |
| break; |
| } |
| num += this.source[this.index++]; |
| } |
| |
| if (num.length === 0) { |
| // only 0b or 0B |
| this.throwUnexpectedToken(); |
| } |
| |
| if (!this.eof()) { |
| ch = this.source.charCodeAt(this.index); |
| /* istanbul ignore else */ |
| if (Character.isIdentifierStart(ch) || Character.isDecimalDigit(ch)) { |
| this.throwUnexpectedToken(); |
| } |
| } |
| |
| return { |
| type: Token.NumericLiteral, |
| value: parseInt(num, 2), |
| lineNumber: this.lineNumber, |
| lineStart: this.lineStart, |
| start: start, |
| end: this.index |
| }; |
| } |
| |
| private scanOctalLiteral(prefix: string, start: number): RawToken { |
| let num = ''; |
| let octal = false; |
| |
| if (Character.isOctalDigit(prefix.charCodeAt(0))) { |
| octal = true; |
| num = '0' + this.source[this.index++]; |
| } else { |
| ++this.index; |
| } |
| |
| while (!this.eof()) { |
| if (!Character.isOctalDigit(this.source.charCodeAt(this.index))) { |
| break; |
| } |
| num += this.source[this.index++]; |
| } |
| |
| if (!octal && num.length === 0) { |
| // only 0o or 0O |
| this.throwUnexpectedToken(); |
| } |
| |
| if (Character.isIdentifierStart(this.source.charCodeAt(this.index)) || Character.isDecimalDigit(this.source.charCodeAt(this.index))) { |
| this.throwUnexpectedToken(); |
| } |
| |
| return { |
| type: Token.NumericLiteral, |
| value: parseInt(num, 8), |
| octal: octal, |
| lineNumber: this.lineNumber, |
| lineStart: this.lineStart, |
| start: start, |
| end: this.index |
| }; |
| } |
| |
| private isImplicitOctalLiteral(): boolean { |
| // Implicit octal, unless there is a non-octal digit. |
| // (Annex B.1.1 on Numeric Literals) |
| for (let i = this.index + 1; i < this.length; ++i) { |
| const ch = this.source[i]; |
| if (ch === '8' || ch === '9') { |
| return false; |
| } |
| if (!Character.isOctalDigit(ch.charCodeAt(0))) { |
| return true; |
| } |
| } |
| |
| return true; |
| } |
| |
| private scanNumericLiteral(): RawToken { |
| const start = this.index; |
| let ch = this.source[start]; |
| assert(Character.isDecimalDigit(ch.charCodeAt(0)) || (ch === '.'), |
| 'Numeric literal must start with a decimal digit or a decimal point'); |
| |
| let num = ''; |
| if (ch !== '.') { |
| num = this.source[this.index++]; |
| ch = this.source[this.index]; |
| |
| // Hex number starts with '0x'. |
| // Octal number starts with '0'. |
| // Octal number in ES6 starts with '0o'. |
| // Binary number in ES6 starts with '0b'. |
| if (num === '0') { |
| if (ch === 'x' || ch === 'X') { |
| ++this.index; |
| return this.scanHexLiteral(start); |
| } |
| if (ch === 'b' || ch === 'B') { |
| ++this.index; |
| return this.scanBinaryLiteral(start); |
| } |
| if (ch === 'o' || ch === 'O') { |
| return this.scanOctalLiteral(ch, start); |
| } |
| |
| if (ch && Character.isOctalDigit(ch.charCodeAt(0))) { |
| if (this.isImplicitOctalLiteral()) { |
| return this.scanOctalLiteral(ch, start); |
| } |
| } |
| } |
| |
| while (Character.isDecimalDigit(this.source.charCodeAt(this.index))) { |
| num += this.source[this.index++]; |
| } |
| ch = this.source[this.index]; |
| } |
| |
| if (ch === '.') { |
| num += this.source[this.index++]; |
| while (Character.isDecimalDigit(this.source.charCodeAt(this.index))) { |
| num += this.source[this.index++]; |
| } |
| ch = this.source[this.index]; |
| } |
| |
| if (ch === 'e' || ch === 'E') { |
| num += this.source[this.index++]; |
| |
| ch = this.source[this.index]; |
| if (ch === '+' || ch === '-') { |
| num += this.source[this.index++]; |
| } |
| if (Character.isDecimalDigit(this.source.charCodeAt(this.index))) { |
| while (Character.isDecimalDigit(this.source.charCodeAt(this.index))) { |
| num += this.source[this.index++]; |
| } |
| } else { |
| this.throwUnexpectedToken(); |
| } |
| } |
| |
| if (Character.isIdentifierStart(this.source.charCodeAt(this.index))) { |
| this.throwUnexpectedToken(); |
| } |
| |
| return { |
| type: Token.NumericLiteral, |
| value: parseFloat(num), |
| lineNumber: this.lineNumber, |
| lineStart: this.lineStart, |
| start: start, |
| end: this.index |
| }; |
| } |
| |
| // https://tc39.github.io/ecma262/#sec-literals-string-literals |
| |
| private scanStringLiteral(): RawToken { |
| const start = this.index; |
| let quote = this.source[start]; |
| assert((quote === '\'' || quote === '"'), |
| 'String literal must starts with a quote'); |
| |
| ++this.index; |
| let octal = false; |
| let str = ''; |
| |
| while (!this.eof()) { |
| let ch = this.source[this.index++]; |
| |
| if (ch === quote) { |
| quote = ''; |
| break; |
| } else if (ch === '\\') { |
| ch = this.source[this.index++]; |
| if (!ch || !Character.isLineTerminator(ch.charCodeAt(0))) { |
| switch (ch) { |
| case 'u': |
| if (this.source[this.index] === '{') { |
| ++this.index; |
| str += this.scanUnicodeCodePointEscape(); |
| } else { |
| const unescapedChar = this.scanHexEscape(ch); |
| if (unescapedChar === null) { |
| this.throwUnexpectedToken(); |
| } |
| str += unescapedChar; |
| } |
| break; |
| case 'x': |
| const unescaped = this.scanHexEscape(ch); |
| if (unescaped === null) { |
| this.throwUnexpectedToken(Messages.InvalidHexEscapeSequence); |
| } |
| str += unescaped; |
| break; |
| case 'n': |
| str += '\n'; |
| break; |
| case 'r': |
| str += '\r'; |
| break; |
| case 't': |
| str += '\t'; |
| break; |
| case 'b': |
| str += '\b'; |
| break; |
| case 'f': |
| str += '\f'; |
| break; |
| case 'v': |
| str += '\x0B'; |
| break; |
| case '8': |
| case '9': |
| str += ch; |
| this.tolerateUnexpectedToken(); |
| break; |
| |
| default: |
| if (ch && Character.isOctalDigit(ch.charCodeAt(0))) { |
| const octToDec = this.octalToDecimal(ch); |
| |
| octal = octToDec.octal || octal; |
| str += String.fromCharCode(octToDec.code); |
| } else { |
| str += ch; |
| } |
| break; |
| } |
| } else { |
| ++this.lineNumber; |
| if (ch === '\r' && this.source[this.index] === '\n') { |
| ++this.index; |
| } |
| this.lineStart = this.index; |
| } |
| } else if (Character.isLineTerminator(ch.charCodeAt(0))) { |
| break; |
| } else { |
| str += ch; |
| } |
| } |
| |
| if (quote !== '') { |
| this.index = start; |
| this.throwUnexpectedToken(); |
| } |
| |
| return { |
| type: Token.StringLiteral, |
| value: str, |
| octal: octal, |
| lineNumber: this.lineNumber, |
| lineStart: this.lineStart, |
| start: start, |
| end: this.index |
| }; |
| } |
| |
| // https://tc39.github.io/ecma262/#sec-template-literal-lexical-components |
| |
| private scanTemplate(): RawToken { |
| let cooked = ''; |
| let terminated = false; |
| const start = this.index; |
| |
| const head = (this.source[start] === '`'); |
| let tail = false; |
| let rawOffset = 2; |
| |
| ++this.index; |
| |
| while (!this.eof()) { |
| let ch = this.source[this.index++]; |
| if (ch === '`') { |
| rawOffset = 1; |
| tail = true; |
| terminated = true; |
| break; |
| } else if (ch === '$') { |
| if (this.source[this.index] === '{') { |
| this.curlyStack.push('${'); |
| ++this.index; |
| terminated = true; |
| break; |
| } |
| cooked += ch; |
| } else if (ch === '\\') { |
| ch = this.source[this.index++]; |
| if (!Character.isLineTerminator(ch.charCodeAt(0))) { |
| switch (ch) { |
| case 'n': |
| cooked += '\n'; |
| break; |
| case 'r': |
| cooked += '\r'; |
| break; |
| case 't': |
| cooked += '\t'; |
| break; |
| case 'u': |
| if (this.source[this.index] === '{') { |
| ++this.index; |
| cooked += this.scanUnicodeCodePointEscape(); |
| } else { |
| const restore = this.index; |
| const unescapedChar = this.scanHexEscape(ch); |
| if (unescapedChar !== null) { |
| cooked += unescapedChar; |
| } else { |
| this.index = restore; |
| cooked += ch; |
| } |
| } |
| break; |
| case 'x': |
| const unescaped = this.scanHexEscape(ch); |
| if (unescaped === null) { |
| this.throwUnexpectedToken(Messages.InvalidHexEscapeSequence); |
| } |
| cooked += unescaped; |
| break; |
| case 'b': |
| cooked += '\b'; |
| break; |
| case 'f': |
| cooked += '\f'; |
| break; |
| case 'v': |
| cooked += '\v'; |
| break; |
| |
| default: |
| if (ch === '0') { |
| if (Character.isDecimalDigit(this.source.charCodeAt(this.index))) { |
| // Illegal: \01 \02 and so on |
| this.throwUnexpectedToken(Messages.TemplateOctalLiteral); |
| } |
| cooked += '\0'; |
| } else if (Character.isOctalDigit(ch.charCodeAt(0))) { |
| // Illegal: \1 \2 |
| this.throwUnexpectedToken(Messages.TemplateOctalLiteral); |
| } else { |
| cooked += ch; |
| } |
| break; |
| } |
| } else { |
| ++this.lineNumber; |
| if (ch === '\r' && this.source[this.index] === '\n') { |
| ++this.index; |
| } |
| this.lineStart = this.index; |
| } |
| } else if (Character.isLineTerminator(ch.charCodeAt(0))) { |
| ++this.lineNumber; |
| if (ch === '\r' && this.source[this.index] === '\n') { |
| ++this.index; |
| } |
| this.lineStart = this.index; |
| cooked += '\n'; |
| } else { |
| cooked += ch; |
| } |
| } |
| |
| if (!terminated) { |
| this.throwUnexpectedToken(); |
| } |
| |
| if (!head) { |
| this.curlyStack.pop(); |
| } |
| |
| return { |
| type: Token.Template, |
| value: this.source.slice(start + 1, this.index - rawOffset), |
| cooked: cooked, |
| head: head, |
| tail: tail, |
| lineNumber: this.lineNumber, |
| lineStart: this.lineStart, |
| start: start, |
| end: this.index |
| }; |
| } |
| |
| // https://tc39.github.io/ecma262/#sec-literals-regular-expression-literals |
| |
| private testRegExp(pattern: string, flags: string): RegExp | null { |
| // The BMP character to use as a replacement for astral symbols when |
| // translating an ES6 "u"-flagged pattern to an ES5-compatible |
| // approximation. |
| // Note: replacing with '\uFFFF' enables false positives in unlikely |
| // scenarios. For example, `[\u{1044f}-\u{10440}]` is an invalid |
| // pattern that would not be detected by this substitution. |
| const astralSubstitute = '\uFFFF'; |
| let tmp = pattern; |
| const self = this; |
| |
| if (flags.indexOf('u') >= 0) { |
| tmp = tmp |
| // Replace every Unicode escape sequence with the equivalent |
| // BMP character or a constant ASCII code point in the case of |
| // astral symbols. (See the above note on `astralSubstitute` |
| // for more information.) |
| .replace(/\\u\{([0-9a-fA-F]+)\}|\\u([a-fA-F0-9]{4})/g, ($0, $1, $2) => { |
| const codePoint = parseInt($1 || $2, 16); |
| if (codePoint > 0x10FFFF) { |
| self.throwUnexpectedToken(Messages.InvalidRegExp); |
| } |
| if (codePoint <= 0xFFFF) { |
| return String.fromCharCode(codePoint); |
| } |
| return astralSubstitute; |
| }) |
| // Replace each paired surrogate with a single ASCII symbol to |
| // avoid throwing on regular expressions that are only valid in |
| // combination with the "u" flag. |
| .replace( |
| /[\uD800-\uDBFF][\uDC00-\uDFFF]/g, |
| astralSubstitute |
| ); |
| } |
| |
| // First, detect invalid regular expressions. |
| try { |
| RegExp(tmp); |
| } catch (e) { |
| this.throwUnexpectedToken(Messages.InvalidRegExp); |
| } |
| |
| // Return a regular expression object for this pattern-flag pair, or |
| // `null` in case the current environment doesn't support the flags it |
| // uses. |
| try { |
| return new RegExp(pattern, flags); |
| } catch (exception) { |
| /* istanbul ignore next */ |
| return null; |
| } |
| } |
| |
| private scanRegExpBody(): string { |
| let ch = this.source[this.index]; |
| assert(ch === '/', 'Regular expression literal must start with a slash'); |
| |
| let str = this.source[this.index++]; |
| let classMarker = false; |
| let terminated = false; |
| |
| while (!this.eof()) { |
| ch = this.source[this.index++]; |
| str += ch; |
| if (ch === '\\') { |
| ch = this.source[this.index++]; |
| // https://tc39.github.io/ecma262/#sec-literals-regular-expression-literals |
| if (Character.isLineTerminator(ch.charCodeAt(0))) { |
| this.throwUnexpectedToken(Messages.UnterminatedRegExp); |
| } |
| str += ch; |
| } else if (Character.isLineTerminator(ch.charCodeAt(0))) { |
| this.throwUnexpectedToken(Messages.UnterminatedRegExp); |
| } else if (classMarker) { |
| if (ch === ']') { |
| classMarker = false; |
| } |
| } else { |
| if (ch === '/') { |
| terminated = true; |
| break; |
| } else if (ch === '[') { |
| classMarker = true; |
| } |
| } |
| } |
| |
| if (!terminated) { |
| this.throwUnexpectedToken(Messages.UnterminatedRegExp); |
| } |
| |
| // Exclude leading and trailing slash. |
| return str.substr(1, str.length - 2); |
| } |
| |
| private scanRegExpFlags(): string { |
| let str = ''; |
| let flags = ''; |
| while (!this.eof()) { |
| let ch = this.source[this.index]; |
| if (!Character.isIdentifierPart(ch.charCodeAt(0))) { |
| break; |
| } |
| |
| ++this.index; |
| if (ch === '\\' && !this.eof()) { |
| ch = this.source[this.index]; |
| if (ch === 'u') { |
| ++this.index; |
| let restore = this.index; |
| const char = this.scanHexEscape('u'); |
| if (char !== null) { |
| flags += char; |
| for (str += '\\u'; restore < this.index; ++restore) { |
| str += this.source[restore]; |
| } |
| } else { |
| this.index = restore; |
| flags += 'u'; |
| str += '\\u'; |
| } |
| this.tolerateUnexpectedToken(); |
| } else { |
| str += '\\'; |
| this.tolerateUnexpectedToken(); |
| } |
| } else { |
| flags += ch; |
| str += ch; |
| } |
| } |
| |
| return flags; |
| } |
| |
| public scanRegExp(): RawToken { |
| const start = this.index; |
| |
| const pattern = this.scanRegExpBody(); |
| const flags = this.scanRegExpFlags(); |
| const value = this.testRegExp(pattern, flags); |
| |
| return { |
| type: Token.RegularExpression, |
| value: '', |
| pattern: pattern, |
| flags: flags, |
| regex: value, |
| lineNumber: this.lineNumber, |
| lineStart: this.lineStart, |
| start: start, |
| end: this.index |
| }; |
| } |
| |
| public lex(): RawToken { |
| if (this.eof()) { |
| return { |
| type: Token.EOF, |
| value: '', |
| lineNumber: this.lineNumber, |
| lineStart: this.lineStart, |
| start: this.index, |
| end: this.index |
| }; |
| } |
| |
| const cp = this.source.charCodeAt(this.index); |
| |
| if (Character.isIdentifierStart(cp)) { |
| return this.scanIdentifier(); |
| } |
| |
| // Very common: ( and ) and ; |
| if (cp === 0x28 || cp === 0x29 || cp === 0x3B) { |
| return this.scanPunctuator(); |
| } |
| |
| // String literal starts with single quote (U+0027) or double quote (U+0022). |
| if (cp === 0x27 || cp === 0x22) { |
| return this.scanStringLiteral(); |
| } |
| |
| // Dot (.) U+002E can also start a floating-point number, hence the need |
| // to check the next character. |
| if (cp === 0x2E) { |
| if (Character.isDecimalDigit(this.source.charCodeAt(this.index + 1))) { |
| return this.scanNumericLiteral(); |
| } |
| return this.scanPunctuator(); |
| } |
| |
| if (Character.isDecimalDigit(cp)) { |
| return this.scanNumericLiteral(); |
| } |
| |
| // Template literals start with ` (U+0060) for template head |
| // or } (U+007D) for template middle or template tail. |
| if (cp === 0x60 || (cp === 0x7D && this.curlyStack[this.curlyStack.length - 1] === '${')) { |
| return this.scanTemplate(); |
| } |
| |
| // Possible identifier start in a surrogate pair. |
| if (cp >= 0xD800 && cp < 0xDFFF) { |
| if (Character.isIdentifierStart(this.codePointAt(this.index))) { |
| return this.scanIdentifier(); |
| } |
| } |
| |
| return this.scanPunctuator(); |
| } |
| |
| } |