#!/usr/bin/env python
# Copyright 2018 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

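"""Generates the perfect-hash keyword matcher header from the keyword list.

Runs gperf over INPUT_PATH, post-processes the generated matcher with the
helper functions below, formats the result with clang-format and writes it to
OUTPUT_PATH. Both paths are resolved relative to the parent directory of this
script.
"""
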
import os
import sys
import subprocess
import re
import math

INPUT_PATH = "src/parsing/keywords.txt"
OUTPUT_PATH = "src/parsing/keywords-gen.h"

# TODO(leszeks): Trimming seems to regress performance, investigate.
TRIM_CHAR_TABLE = False


def next_power_of_2(x):
  """Rounds x up to the nearest power of two (0 is rounded up to 1)."""
  return 1 if x == 0 else 2**int(math.ceil(math.log(x, 2)))


def call_with_input(cmd, input_string=""):
  """Runs cmd, piping input_string to its stdin, and returns its stdout."""
  # universal_newlines keeps the pipes in text mode so this works under both
  # Python 2 and Python 3.
  p = subprocess.Popen(
      cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
      universal_newlines=True)
  stdout, _ = p.communicate(input_string)
  retcode = p.wait()
  if retcode != 0:
    raise subprocess.CalledProcessError(retcode, cmd)
  return stdout


def checked_sub(pattern, sub, out, count=1, flags=0):
  """Performs a regex substitution, requiring exactly `count` replacements."""
  out, n = re.subn(pattern, sub, out, flags=flags)
  if n != count:
    raise Exception("Didn't get exactly %d replacement(s) for pattern: %s" %
                    (count, pattern))
  return out


def change_sizet_to_int(out):
  # Literal buffer lengths are given as ints, not size_t
  return checked_sub(r'\bsize_t\b', 'int', out, count=4)


def drop_line_directives(out):
  # #line causes gcov issue, so drop it
  return re.sub(r'^#\s*line .*$\n', '', out, flags=re.MULTILINE)


def trim_and_dcheck_char_table(out):
  # Potential keyword strings are known to be lowercase ascii, so chop off the
  # rest of the table and mask out the char

  reads_re = re.compile(
      r'asso_values\[static_cast<unsigned char>\(str\[(\d+)\]\)\]')

  dchecks = []
  for str_read in reads_re.finditer(out):
    dchecks.append("DCHECK_LT(str[%d], 128);" % int(str_read.group(1)))
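  # The collected DCHECKs are spliced in immediately before the asso_values
  # table definition below, documenting the ASCII assumption.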

  if TRIM_CHAR_TABLE:
    out = checked_sub(
        r'static const unsigned char asso_values\[\]\s*=\s*\{(\s*\d+\s*,){96}',
        "".join(dchecks) + r'static const unsigned char asso_values[32] = {',
        out,
        flags=re.MULTILINE)
    out = checked_sub(
        reads_re.pattern,
        r'asso_values[static_cast<unsigned char>(str[(\1)]&31)]',
        out,
        count=len(dchecks),
        flags=re.MULTILINE)
  else:
    out = checked_sub(
        r'static const unsigned char asso_values\[\]\s*=\s*\{',
        "".join(dchecks) + r'static const unsigned char asso_values[128] = {',
        out,
        flags=re.MULTILINE)

  return out


def use_isinrange(out):
  # Our IsInRange method is more efficient than checking for min/max length
  return checked_sub(r'if \(len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH\)',
                     r'if (IsInRange(len, MIN_WORD_LENGTH, MAX_WORD_LENGTH))',
                     out)


def pad_tables(out):
  # We don't want to compare against the max hash value, so pad the tables up
  # to a power of two and mask the hash.

  # First get the new size
  max_hash_value = int(re.search(r'MAX_HASH_VALUE\s*=\s*(\d+)', out).group(1))
  old_table_length = max_hash_value + 1
  new_table_length = next_power_of_2(old_table_length)
  table_padding_len = new_table_length - old_table_length
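  # For example, a MAX_HASH_VALUE of 57 gives a 58-entry table, which is
  # padded up to 64 entries so lookups can be masked with 0x3f.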

  # Pad the length table.
  single_lengthtable_entry = r'\d+'
  out = checked_sub(
      r"""
      static\ const\ unsigned\ char\ kPerfectKeywordLengthTable\[\]\s*=\s*\{
        (
          \s*%(single_lengthtable_entry)s\s*
          (?:,\s*%(single_lengthtable_entry)s\s*)*
        )
      \}
      """ % {'single_lengthtable_entry': single_lengthtable_entry},
      r'static const unsigned char kPerfectKeywordLengthTable[%d] = { \1 %s }'
      % (new_table_length, "".join([',0'] * table_padding_len)),
      out,
      flags=re.MULTILINE | re.VERBOSE)

  # Pad the word list.
  single_wordlist_entry = r"""
      (?:\#line\ \d+\ ".*"$\s*)?
      \{\s*"[a-z]*"\s*,\s*Token::[A-Z_]+\}
      """
  out = checked_sub(
      r"""
      static\ const\ struct\ PerfectKeywordHashTableEntry\ kPerfectKeywordHashTable\[\]\s*=\s*\{
        (
          \s*%(single_wordlist_entry)s\s*
          (?:,\s*%(single_wordlist_entry)s\s*)*
        )
      \}
      """ % {'single_wordlist_entry': single_wordlist_entry},
      r'static const struct PerfectKeywordHashTableEntry kPerfectKeywordHashTable[%d] = {\1 %s }'
      % (new_table_length, "".join(
          [',{"",Token::IDENTIFIER}'] * table_padding_len)),
      out,
      flags=re.MULTILINE | re.VERBOSE)
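  # Padded slots hold {"", Token::IDENTIFIER} so that a masked hash landing in
  # the padding does not match any keyword and falls through to the identifier
  # case.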

  # Mask the hash and replace the range check with DCHECKs.
  out = checked_sub(r'Hash\s*\(\s*str,\s*len\s*\)',
                    r'Hash(str, len)&0x%x' % (new_table_length - 1), out)
  out = checked_sub(
      r'if \(key <= MAX_HASH_VALUE\)',
      r'DCHECK_LT(key, arraysize(kPerfectKeywordLengthTable));DCHECK_LT(key, arraysize(kPerfectKeywordHashTable));',
      out)

  return out


def return_token(out):
  # We want to return the actual token rather than the table entry.

  # Change the return type of the function. Make it inline too.
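  # count=2: both the declaration and the out-of-class definition of GetToken
  # need their return type rewritten.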
  out = checked_sub(
      r'const\s*struct\s*PerfectKeywordHashTableEntry\s*\*\s*((?:PerfectKeywordHash::)?GetToken)',
      r'inline Token::Value \1',
      out,
      count=2)

  # Change the return value when the keyword is found
  out = checked_sub(r'return &kPerfectKeywordHashTable\[key\];',
                    r'return kPerfectKeywordHashTable[key].value;', out)

  # Change the return value when the keyword is not found
  out = checked_sub(r'return 0;', r'return Token::IDENTIFIER;', out)

  return out


def memcmp_to_while(out):
  # It's faster to loop over the keyword with a while loop than calling memcmp.
  # Careful: this replacement is quite brittle because the matched snippet is
  # spelled out literally; a more general regex would be unreadable.
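  # The generated loop walks the stored keyword (s) and bails out with
  # Token::IDENTIFIER on the first character that differs from the input (str).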
  return checked_sub(
      re.escape("if (*str == *s && !memcmp (str + 1, s + 1, len - 1))") + r"\s*"
      + re.escape("return kPerfectKeywordHashTable[key].value;"),
      """
      while(*s!=0) {
        if (*s++ != *str++) return Token::IDENTIFIER;
      }
      return kPerfectKeywordHashTable[key].value;
      """,
      out,
      flags=re.MULTILINE)


def wrap_namespace(out):
  return """// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// This file is automatically generated by gen-keywords-gen-h.py and should not
// be modified manually.

#ifndef V8_PARSING_KEYWORDS_GEN_H_
#define V8_PARSING_KEYWORDS_GEN_H_

#include "src/parsing/token.h"

namespace v8 {
namespace internal {

%s

}  // namespace internal
}  // namespace v8

#endif  // V8_PARSING_KEYWORDS_GEN_H_
""" % (out)


def trim_character_set_warning(out):
  # gperf generates an error message that is too large, trim it

  return out.replace(
      '"gperf generated tables don\'t work with this execution character set. Please report a bug to <bug-gperf@gnu.org>."',
      '"gperf generated tables don\'t work with this execution character set."\\\n// If you see this error, please report a bug to <bug-gperf@gnu.org>.'
  )


def main():
  try:
    script_dir = os.path.dirname(sys.argv[0])
    root_dir = os.path.join(script_dir, '..')

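    # -m100 lets gperf try multiple hash-function candidates and keep the best
    # result.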
    # universal_newlines gives us text rather than bytes on Python 3.
    out = subprocess.check_output(["gperf", "-m100", INPUT_PATH],
                                  cwd=root_dir, universal_newlines=True)

    # And now some munging of the generated file.
    out = change_sizet_to_int(out)
    out = drop_line_directives(out)
    out = trim_and_dcheck_char_table(out)
    out = use_isinrange(out)
    out = pad_tables(out)
    out = return_token(out)
    out = memcmp_to_while(out)
    out = wrap_namespace(out)
    out = trim_character_set_warning(out)

    # Final formatting.
    clang_format_path = os.path.join(root_dir,
                                     'third_party/depot_tools/clang-format')
    out = call_with_input([clang_format_path], out)

    with open(os.path.join(root_dir, OUTPUT_PATH), 'w') as f:
      f.write(out)

    return 0

  except subprocess.CalledProcessError as e:
    sys.stderr.write("Error calling '{}'\n".format(" ".join(e.cmd)))
    return e.returncode


if __name__ == '__main__':
  sys.exit(main())