// blob: 9ed338fc1d8b331518dc867efd5f55e25dad1c5e [file] [log] [blame]
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>

#include "src/regexp/special-case.h"
namespace v8 {
namespace internal {
// First code point of the surrogate range that PrintSpecial skips.
static const uc32 kSurrogateStart = 0xd800;
// Last code point (inclusive) of the surrogate range that PrintSpecial skips.
static const uc32 kSurrogateEnd = 0xdfff;
// First code point past the BMP; exclusive upper bound of PrintSpecial's scan.
static const uc32 kNonBmpStart = 0x10000;
// The following code generates "src/regexp/special-case.cc".
// Writes to |out| the C++ source that rebuilds |set| at runtime.
//
// Three definitions are emitted, all derived from |name|:
//   - icu::UnicodeSet Build<name>(): adds every range of |set|, then freezes
//     and returns the result;
//   - struct <name>Data: stores Build<name>() in a const member;
//   - RegExpCaseFolding::<name>(): returns that member via base::LazyInstance.
//
// Code points are written after a hard-coded "0x" prefix, so they must be
// formatted as hexadecimal. The stream is switched to std::hex here instead
// of trusting the caller to have done it (a decimal stream would silently
// generate wrong literals); the caller's format flags are restored on return.
void PrintSet(std::ofstream& out, const char* name,
              const icu::UnicodeSet& set) {
  const std::ios_base::fmtflags saved_flags = out.flags();
  out << std::hex;

  out << "icu::UnicodeSet Build" << name << "() {\n"
      << " icu::UnicodeSet set;\n";
  const int32_t range_count = set.getRangeCount();
  for (int32_t i = 0; i < range_count; i++) {
    const auto range_start = set.getRangeStart(i);
    const auto range_end = set.getRangeEnd(i);
    if (range_start == range_end) {
      // Single code point: use the one-argument add().
      out << " set.add(0x" << range_start << ");\n";
    } else {
      out << " set.add(0x" << range_start << ", 0x" << range_end << ");\n";
    }
  }
  out << " set.freeze();\n"
      << " return set;\n"
      << "}\n\n";

  out << "struct " << name << "Data {\n"
      << " " << name << "Data() : set(Build" << name << "()) {}\n"
      << " const icu::UnicodeSet set;\n"
      << "};\n\n";

  out << "//static\n"
      << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
      << " static base::LazyInstance<" << name << "Data>::type set =\n"
      << " LAZY_INSTANCE_INITIALIZER;\n"
      << " return set.Pointer()->set;\n"
      << "}\n\n";

  out.flags(saved_flags);
}
// Scans every BMP code point outside the surrogate range, compares its
// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) equivalence class against
// RegExpCaseFolding::Canonicalize, and collects the code points where the two
// disagree into two sets. After a consistency check, both sets are emitted to
// |out| through PrintSet as "IgnoreSet" and "SpecialAddSet".
void PrintSpecial(std::ofstream& out) {
  icu::UnicodeSet current;
  icu::UnicodeSet special_add;
  icu::UnicodeSet ignore;

  // Iterate through all chars in BMP except surrogates.
  for (UChar32 i = 0; i < static_cast<UChar32>(kNonBmpStart); i++) {
    if (i >= static_cast<UChar32>(kSurrogateStart) &&
        i <= static_cast<UChar32>(kSurrogateEnd)) {
      continue;  // Ignore surrogate range
    }

    current.set(i, i);
    current.closeOver(USET_CASE_INSENSITIVE);

    // Check to see if all characters in the case-folding equivalence
    // class as defined by UnicodeSet::closeOver all map to the same
    // canonical value.
    const UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
    bool class_has_matching_canonical_char = false;
    bool class_has_non_matching_canonical_char = false;
    for (int32_t j = 0; j < current.getRangeCount(); j++) {
      for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
           c++) {
        if (c == i) {
          continue;  // Only the *other* members of the class are of interest.
        }
        const UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
        if (canonical == other_canonical) {
          class_has_matching_canonical_char = true;
        } else {
          class_has_non_matching_canonical_char = true;
        }
      }
    }

    // i is only recorded when some other member of its equivalence class
    // has a different canonical value. In that case:
    //  - if at least one other member shares i's canonical value, i needs
    //    special handling and goes into special_add;
    //  - if no other member shares it, i can be ignored when adding
    //    alternatives for case-independent comparison and goes into ignore.
    if (class_has_non_matching_canonical_char) {
      if (class_has_matching_canonical_char) {
        special_add.add(i);
      } else {
        ignore.add(i);
      }
    }
  }

  // Verify that no Unicode equivalence class contains two non-trivial
  // JS equivalence classes. Every character in SpecialAddSet has the
  // same canonical value as every other non-IgnoreSet character in
  // its Unicode equivalence class. Therefore, if we call closeOver on
  // a set containing no IgnoreSet characters, the only characters
  // that must be removed from the result are in IgnoreSet. This fact
  // is used in CharacterRange::AddCaseEquivalents.
  for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
    for (UChar32 c = special_add.getRangeStart(i);
         c <= special_add.getRangeEnd(i); c++) {
      UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
      current.set(c, c);
      current.closeOver(USET_CASE_INSENSITIVE);
      current.removeAll(ignore);
      for (int32_t j = 0; j < current.getRangeCount(); j++) {
        for (UChar32 c2 = current.getRangeStart(j);
             c2 <= current.getRangeEnd(j); c2++) {
          CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
        }
      }
    }
  }

  PrintSet(out, "IgnoreSet", ignore);
  PrintSet(out, "SpecialAddSet", special_add);
}
// Generates the special-case source file at |header_filename|: a license and
// explanatory banner, the required includes, and the set-building functions
// produced by PrintSpecial, all wrapped in `#ifdef V8_INTL_SUPPORT` and the
// v8::internal namespace.
//
// If the file cannot be opened, or the stream is in a failed state after
// writing, an error is printed to stderr and the process exits with status 1
// so that a build step invoking this tool does not silently succeed.
void WriteHeader(const char* header_filename) {
  std::ofstream out(header_filename);
  if (!out.is_open()) {
    std::cerr << "Failed to open " << header_filename << " for writing\n";
    std::exit(1);
  }

  // PrintSet writes code points after a literal "0x", so integers must be
  // formatted as hexadecimal. (A field width set here would not help pad
  // them: std::setw only applies to the next insertion, which is the
  // copyright text below.)
  out << std::hex;
  out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
      << "// Use of this source code is governed by a BSD-style license that\n"
      << "// can be found in the LICENSE file.\n\n"
      << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
      << "// The following functions are used to build UnicodeSets\n"
      << "// for special cases where the case-folding algorithm used by\n"
      << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
      << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
      << "// Semantics: Canonicalize) step 3.\n\n"
      << "#ifdef V8_INTL_SUPPORT\n"
      << "#include \"src/base/lazy-instance.h\"\n\n"
      << "#include \"src/regexp/special-case.h\"\n\n"
      << "#include \"unicode/uniset.h\"\n"
      << "namespace v8 {\n"
      << "namespace internal {\n\n";

  PrintSpecial(out);

  out << "\n"
      << "} // namespace internal\n"
      << "} // namespace v8\n"
      << "#endif // V8_INTL_SUPPORT\n";

  // Push everything to disk now so that write errors are detected here
  // rather than being lost in the destructor.
  out.flush();
  if (!out) {
    std::cerr << "Failed to write " << header_filename << "\n";
    std::exit(1);
  }
}
} // namespace internal
} // namespace v8
// Tool entry point. Takes exactly one argument, the path of the file to
// generate; with any other argument count it prints a usage line to stderr
// and exits with status 1.
int main(int argc, const char** argv) {
  const bool has_single_argument = (argc == 2);
  if (has_single_argument) {
    const char* output_filename = argv[1];
    v8::internal::WriteHeader(output_filename);
    return 0;
  }

  std::cerr << "Usage: " << argv[0] << " <output filename>\n";
  return 1;
}