| // Copyright 2019 the V8 project authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
| |
| #include "src/base/logging.h" |
| #include "unicode/uchar.h" |
| #include "unicode/uniset.h" |
| |
| namespace v8 { |
| namespace internal { |
| |
| // The following code generates BuildSpecialAddSet() and BuildIgnoreSet() |
| // functions into "src/regexp/special-case.cc". |
| // See more details in http://shorturl.at/adfO5 |
| void PrintSet(std::ofstream& out, const char* func_name, |
| const icu::UnicodeSet& set) { |
| out << "icu::UnicodeSet " << func_name << "() {\n" |
| << " icu::UnicodeSet set;\n"; |
| for (int32_t i = 0; i < set.getRangeCount(); i++) { |
| if (set.getRangeStart(i) == set.getRangeEnd(i)) { |
| out << " set.add(0x" << set.getRangeStart(i) << ");\n"; |
| } else { |
| out << " set.add(0x" << set.getRangeStart(i) << ", 0x" |
| << set.getRangeEnd(i) << ");\n"; |
| } |
| } |
| out << " set.freeze();\n" |
| << " return set;\n" |
| << "}\n"; |
| } |
| |
| void PrintSpecial(std::ofstream& out) { |
| icu::UnicodeSet current; |
| icu::UnicodeSet processed(0xd800, 0xdbff); // Ignore surrogate range. |
| icu::UnicodeSet special_add; |
| icu::UnicodeSet ignore; |
| UErrorCode status = U_ZERO_ERROR; |
| icu::UnicodeSet upper("[\\p{Lu}]", status); |
| CHECK(U_SUCCESS(status)); |
| // Iterate through all chars in BMP except ASCII and Surrogate. |
| for (UChar32 i = 0x80; i < 0x010000; i++) { |
| // Ignore those characters which is already processed. |
| if (!processed.contains(i)) { |
| current.set(i, i); |
| current.closeOver(USET_CASE_INSENSITIVE); |
| |
| // Remember we already processed current. |
| processed.addAll(current); |
| |
| // All uppercase characters in current. |
| icu::UnicodeSet keep_upper(current); |
| keep_upper.retainAll(upper); |
| |
| // Check if we have more than one uppercase character in current. |
| // If there are more than one uppercase character, then it is a special |
| // set which need to be added into either "Special Add" set or "Ignore" |
| // set. |
| int32_t number_of_upper = 0; |
| for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) { |
| number_of_upper += |
| keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1; |
| } |
| if (number_of_upper > 1) { |
| // Add all non uppercase characters (could be Ll or Mn) to special add |
| // set. |
| current.removeAll(upper); |
| special_add.addAll(current); |
| |
| // Add the uppercase characters of non uppercase character to |
| // special add set. |
| CHECK_GT(current.getRangeCount(), 0); |
| UChar32 main_upper = u_toupper(current.getRangeStart(0)); |
| special_add.add(main_upper); |
| |
| // Add all uppercase except the main upper to ignore set. |
| keep_upper.remove(main_upper); |
| ignore.addAll(keep_upper); |
| } |
| } |
| } |
| |
| // Remove any ASCII |
| special_add.remove(0x0000, 0x007f); |
| PrintSet(out, "BuildIgnoreSet", ignore); |
| PrintSet(out, "BuildSpecialAddSet", special_add); |
| } |
| |
| void WriteHeader(const char* header_filename) { |
| std::ofstream out(header_filename); |
| out << std::hex << std::setfill('0') << std::setw(4); |
| |
| out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n" |
| << "// The following functions are used to build icu::UnicodeSet\n" |
| << "// for specical cases different between Unicode and ECMA262.\n" |
| << "#ifdef V8_INTL_SUPPORT\n" |
| << "#include \"src/regexp/special-case.h\"\n\n" |
| << "#include \"unicode/uniset.h\"\n" |
| << "namespace v8 {\n" |
| << "namespace internal {\n\n"; |
| |
| PrintSpecial(out); |
| |
| out << "\n" |
| << "} // namespace internal\n" |
| << "} // namespace v8\n" |
| << "#endif // V8_INTL_SUPPORT\n"; |
| } |
| |
| } // namespace internal |
| } // namespace v8 |
| |
| int main(int argc, const char** argv) { |
| if (argc != 2) { |
| std::cerr << "Usage: " << argv[0] << " <output filename>\n"; |
| std::exit(1); |
| } |
| v8::internal::WriteHeader(argv[1]); |
| |
| return 0; |
| } |