blob: 8aace6ab888d75a749d9d5f49972366607ea3ba4 [file] [log] [blame]
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>

#include "src/base/logging.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
namespace v8 {
namespace internal {
// The following code generates BuildSpecialAddSet() and BuildIgnoreSet()
// functions into "src/regexp/special-case.cc".
// See more details in http://shorturl.at/adfO5
// Writes to |out| the C++ source of a function named |func_name| that takes
// no arguments and returns a frozen icu::UnicodeSet containing every code
// point range of |set|. A range with a single element is emitted as
// set.add(c); a wider range is emitted as set.add(start, end). Only what
// getRangeCount()/getRangeStart()/getRangeEnd() report is emitted.
//
// The emitted literals carry a "0x" prefix, so the numbers have to be
// formatted in hexadecimal. The stream is switched to std::hex here rather
// than trusting the caller to have done it, and the caller's format flags are
// put back before returning.
void PrintSet(std::ofstream& out, const char* func_name,
              const icu::UnicodeSet& set) {
  const std::ios_base::fmtflags saved_flags = out.flags();
  out << std::hex;

  out << "icu::UnicodeSet " << func_name << "() {\n"
      << " icu::UnicodeSet set;\n";
  const int32_t range_count = set.getRangeCount();
  for (int32_t range = 0; range < range_count; range++) {
    const UChar32 start = set.getRangeStart(range);
    const UChar32 end = set.getRangeEnd(range);
    out << " set.add(0x" << start;
    if (start != end) {
      // Two-argument form for a real range.
      out << ", 0x" << end;
    }
    out << ");\n";
  }
  out << " set.freeze();\n"
      << " return set;\n"
      << "}\n";

  out.flags(saved_flags);
}
void PrintSpecial(std::ofstream& out) {
icu::UnicodeSet current;
icu::UnicodeSet processed(0xd800, 0xdbff); // Ignore surrogate range.
icu::UnicodeSet special_add;
icu::UnicodeSet ignore;
UErrorCode status = U_ZERO_ERROR;
icu::UnicodeSet upper("[\\p{Lu}]", status);
CHECK(U_SUCCESS(status));
// Iterate through all chars in BMP except ASCII and Surrogate.
for (UChar32 i = 0x80; i < 0x010000; i++) {
// Ignore those characters which is already processed.
if (!processed.contains(i)) {
current.set(i, i);
current.closeOver(USET_CASE_INSENSITIVE);
// Remember we already processed current.
processed.addAll(current);
// All uppercase characters in current.
icu::UnicodeSet keep_upper(current);
keep_upper.retainAll(upper);
// Check if we have more than one uppercase character in current.
// If there are more than one uppercase character, then it is a special
// set which need to be added into either "Special Add" set or "Ignore"
// set.
int32_t number_of_upper = 0;
for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) {
number_of_upper +=
keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1;
}
if (number_of_upper > 1) {
// Add all non uppercase characters (could be Ll or Mn) to special add
// set.
current.removeAll(upper);
special_add.addAll(current);
// Add the uppercase characters of non uppercase character to
// special add set.
CHECK_GT(current.getRangeCount(), 0);
UChar32 main_upper = u_toupper(current.getRangeStart(0));
special_add.add(main_upper);
// Add all uppercase except the main upper to ignore set.
keep_upper.remove(main_upper);
ignore.addAll(keep_upper);
}
}
}
// Remove any ASCII
special_add.remove(0x0000, 0x007f);
PrintSet(out, "BuildIgnoreSet", ignore);
PrintSet(out, "BuildSpecialAddSet", special_add);
}
// Opens |header_filename| for writing and emits the generated source: a fixed
// preamble, the two set-building functions produced by PrintSpecial(), and
// the closing namespace / #endif lines.
//
// Terminates the process with exit status 1 if the file cannot be opened or
// if writing to it fails, so that a missing or partial output file is not
// reported as success.
void WriteHeader(const char* header_filename) {
  std::ofstream out(header_filename);
  if (!out.is_open()) {
    std::cerr << "Failed to open " << header_filename << " for writing\n";
    std::exit(1);
  }

  // The generated set.add() calls use "0x"-prefixed literals, so integers
  // written to this stream must be formatted in hexadecimal. (std::setw only
  // applies to the next insertion, so a width set here would never reach the
  // numbers; none is set.)
  out << std::hex;
  out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n"
      << "// The following functions are used to build icu::UnicodeSet\n"
      << "// for special cases different between Unicode and ECMA262.\n"
      << "#ifdef V8_INTL_SUPPORT\n"
      << "#include \"src/regexp/special-case.h\"\n\n"
      << "#include \"unicode/uniset.h\"\n"
      << "namespace v8 {\n"
      << "namespace internal {\n\n";
  PrintSpecial(out);
  out << "\n"
      << "} // namespace internal\n"
      << "} // namespace v8\n"
      << "#endif // V8_INTL_SUPPORT\n";

  // close() flushes; any earlier or final write error shows up in fail().
  out.close();
  if (out.fail()) {
    std::cerr << "Failed to write " << header_filename << "\n";
    std::exit(1);
  }
}
} // namespace internal
} // namespace v8
// Entry point. Takes exactly one command-line argument, the path of the file
// to generate; prints a usage line to stderr and returns 1 otherwise.
int main(int argc, const char** argv) {
  const bool has_single_output_path = (argc == 2);
  if (!has_single_output_path) {
    std::cerr << "Usage: " << argv[0] << " <output filename>\n";
    return 1;
  }

  const char* output_path = argv[1];
  v8::internal::WriteHeader(output_path);
  return 0;
}