| // Copyright 2020 the V8 project authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include <fstream> |
| #include <iomanip> |
| #include <iostream> |
| #include <sstream> |
| |
| #include "src/regexp/special-case.h" |
| |
| namespace v8 { |
| namespace internal { |
| |
| static const uc32 kSurrogateStart = 0xd800; |
| static const uc32 kSurrogateEnd = 0xdfff; |
| static const uc32 kNonBmpStart = 0x10000; |
| |
| // The following code generates "src/regexp/special-case.cc". |
| void PrintSet(std::ofstream& out, const char* name, |
| const icu::UnicodeSet& set) { |
| out << "icu::UnicodeSet Build" << name << "() {\n" |
| << " icu::UnicodeSet set;\n"; |
| for (int32_t i = 0; i < set.getRangeCount(); i++) { |
| if (set.getRangeStart(i) == set.getRangeEnd(i)) { |
| out << " set.add(0x" << set.getRangeStart(i) << ");\n"; |
| } else { |
| out << " set.add(0x" << set.getRangeStart(i) << ", 0x" |
| << set.getRangeEnd(i) << ");\n"; |
| } |
| } |
| out << " set.freeze();\n" |
| << " return set;\n" |
| << "}\n\n"; |
| |
| out << "struct " << name << "Data {\n" |
| << " " << name << "Data() : set(Build" << name << "()) {}\n" |
| << " const icu::UnicodeSet set;\n" |
| << "};\n\n"; |
| |
| out << "//static\n" |
| << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n" |
| << " static base::LazyInstance<" << name << "Data>::type set =\n" |
| << " LAZY_INSTANCE_INITIALIZER;\n" |
| << " return set.Pointer()->set;\n" |
| << "}\n\n"; |
| } |
| |
| void PrintSpecial(std::ofstream& out) { |
| icu::UnicodeSet current; |
| icu::UnicodeSet special_add; |
| icu::UnicodeSet ignore; |
| UErrorCode status = U_ZERO_ERROR; |
| icu::UnicodeSet upper("[\\p{Lu}]", status); |
| CHECK(U_SUCCESS(status)); |
| |
| // Iterate through all chars in BMP except surrogates. |
| for (UChar32 i = 0; i < static_cast<UChar32>(kNonBmpStart); i++) { |
| if (i >= static_cast<UChar32>(kSurrogateStart) && |
| i <= static_cast<UChar32>(kSurrogateEnd)) { |
| continue; // Ignore surrogate range |
| } |
| current.set(i, i); |
| current.closeOver(USET_CASE_INSENSITIVE); |
| |
| // Check to see if all characters in the case-folding equivalence |
| // class as defined by UnicodeSet::closeOver all map to the same |
| // canonical value. |
| UChar32 canonical = RegExpCaseFolding::Canonicalize(i); |
| bool class_has_matching_canonical_char = false; |
| bool class_has_non_matching_canonical_char = false; |
| for (int32_t j = 0; j < current.getRangeCount(); j++) { |
| for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j); |
| c++) { |
| if (c == i) { |
| continue; |
| } |
| UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c); |
| if (canonical == other_canonical) { |
| class_has_matching_canonical_char = true; |
| } else { |
| class_has_non_matching_canonical_char = true; |
| } |
| } |
| } |
| // If any other character in i's equivalence class has a |
| // different canonical value, then i needs special handling. If |
| // no other character shares a canonical value with i, we can |
| // ignore i when adding alternatives for case-independent |
| // comparison. If at least one other character shares a |
| // canonical value, then i needs special handling. |
| if (class_has_non_matching_canonical_char) { |
| if (class_has_matching_canonical_char) { |
| special_add.add(i); |
| } else { |
| ignore.add(i); |
| } |
| } |
| } |
| |
| // Verify that no Unicode equivalence class contains two non-trivial |
| // JS equivalence classes. Every character in SpecialAddSet has the |
| // same canonical value as every other non-IgnoreSet character in |
| // its Unicode equivalence class. Therefore, if we call closeOver on |
| // a set containing no IgnoreSet characters, the only characters |
| // that must be removed from the result are in IgnoreSet. This fact |
| // is used in CharacterRange::AddCaseEquivalents. |
| for (int32_t i = 0; i < special_add.getRangeCount(); i++) { |
| for (UChar32 c = special_add.getRangeStart(i); |
| c <= special_add.getRangeEnd(i); c++) { |
| UChar32 canonical = RegExpCaseFolding::Canonicalize(c); |
| current.set(c, c); |
| current.closeOver(USET_CASE_INSENSITIVE); |
| current.removeAll(ignore); |
| for (int32_t j = 0; j < current.getRangeCount(); j++) { |
| for (UChar32 c2 = current.getRangeStart(j); |
| c2 <= current.getRangeEnd(j); c2++) { |
| CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2)); |
| } |
| } |
| } |
| } |
| |
| PrintSet(out, "IgnoreSet", ignore); |
| PrintSet(out, "SpecialAddSet", special_add); |
| } |
| |
| void WriteHeader(const char* header_filename) { |
| std::ofstream out(header_filename); |
| out << std::hex << std::setfill('0') << std::setw(4); |
| out << "// Copyright 2020 the V8 project authors. All rights reserved.\n" |
| << "// Use of this source code is governed by a BSD-style license that\n" |
| << "// can be found in the LICENSE file.\n\n" |
| << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n" |
| << "// The following functions are used to build UnicodeSets\n" |
| << "// for special cases where the case-folding algorithm used by\n" |
| << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n" |
| << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n" |
| << "// Semantics: Canonicalize) step 3.\n\n" |
| << "#ifdef V8_INTL_SUPPORT\n" |
| << "#include \"src/base/lazy-instance.h\"\n\n" |
| << "#include \"src/regexp/special-case.h\"\n\n" |
| << "#include \"unicode/uniset.h\"\n" |
| << "namespace v8 {\n" |
| << "namespace internal {\n\n"; |
| |
| PrintSpecial(out); |
| |
| out << "\n" |
| << "} // namespace internal\n" |
| << "} // namespace v8\n" |
| << "#endif // V8_INTL_SUPPORT\n"; |
| } |
| |
| } // namespace internal |
| } // namespace v8 |
| |
| int main(int argc, const char** argv) { |
| if (argc != 2) { |
| std::cerr << "Usage: " << argv[0] << " <output filename>\n"; |
| std::exit(1); |
| } |
| v8::internal::WriteHeader(argv[1]); |
| |
| return 0; |
| } |