src/v8/src/regexp/gen-regexp-special-case.cc - cobalt - Git at Google

 // Copyright 2019 the V8 project authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <sstream>

 #include "src/base/logging.h"
 #include "unicode/uchar.h"
 #include "unicode/uniset.h"

 namespace v8 {
 namespace internal {

 // The following code generates BuildSpecialAddSet() and BuildIgnoreSet()
 // functions into "src/regexp/special-case.cc".
 // See more details in http://shorturl.at/adfO5
 // Writes to |out| the C++ source of a function named |func_name| that
 // rebuilds |set| at runtime.
 //
 // The emitted function creates an empty icu::UnicodeSet, re-adds every range
 // of |set| (a single code point uses the one-argument add(), a wider range
 // the two-argument add()), freezes the set and returns it.
 //
 // Code points are written as zero-padded hex literals of at least four
 // digits. The formatting is applied here, per value, because std::setw() only
 // affects the next insertion and therefore cannot be configured once by the
 // caller. The stream's format flags and fill character are restored before
 // returning.
 void PrintSet(std::ofstream& out, const char* func_name,
               const icu::UnicodeSet& set) {
   const std::ios_base::fmtflags saved_flags = out.flags();
   const char saved_fill = out.fill('0');
   out << std::hex;

   out << "icu::UnicodeSet " << func_name << "() {\n"
       << "  icu::UnicodeSet set;\n";

   const int32_t range_count = set.getRangeCount();
   for (int32_t range = 0; range < range_count; range++) {
     const UChar32 first = set.getRangeStart(range);
     const UChar32 last = set.getRangeEnd(range);
     // The width must be re-armed before every number; it resets to 0 after
     // each formatted insertion.
     out << "  set.add(0x" << std::setw(4) << first;
     if (first != last) {
       out << ", 0x" << std::setw(4) << last;
     }
     out << ");\n";
   }

   out << "  set.freeze();\n"
       << "  return set;\n"
       << "}\n";

   out.fill(saved_fill);
   out.flags(saved_flags);
 }

 void PrintSpecial(std::ofstream& out) {
   icu::UnicodeSet current;
   icu::UnicodeSet processed(0xd800, 0xdbff);  // Ignore surrogate range.
   icu::UnicodeSet special_add;
   icu::UnicodeSet ignore;
   UErrorCode status = U_ZERO_ERROR;
   icu::UnicodeSet upper("[\\p{Lu}]", status);
   CHECK(U_SUCCESS(status));
   // Iterate through all chars in BMP except ASCII and Surrogate.
   for (UChar32 i = 0x80; i < 0x010000; i++) {
     // Ignore those characters which is already processed.
     if (!processed.contains(i)) {
       current.set(i, i);
       current.closeOver(USET_CASE_INSENSITIVE);

       // Remember we already processed current.
       processed.addAll(current);

       // All uppercase characters in current.
       icu::UnicodeSet keep_upper(current);
       keep_upper.retainAll(upper);

       // Check if we have more than one uppercase character in current.
       // If there are more than one uppercase character, then it is a special
       // set which need to be added into either "Special Add" set or "Ignore"
       // set.
       int32_t number_of_upper = 0;
       for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) {
         number_of_upper +=
             keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1;
       }
       if (number_of_upper > 1) {
         // Add all non uppercase characters (could be Ll or Mn) to special add
         // set.
         current.removeAll(upper);
         special_add.addAll(current);

         // Add the uppercase characters of non uppercase character to
         // special add set.
         CHECK_GT(current.getRangeCount(), 0);
         UChar32 main_upper = u_toupper(current.getRangeStart(0));
         special_add.add(main_upper);

         // Add all uppercase except the main upper to ignore set.
         keep_upper.remove(main_upper);
         ignore.addAll(keep_upper);
       }
     }
   }

   // Remove any ASCII
   special_add.remove(0x0000, 0x007f);
   PrintSet(out, "BuildIgnoreSet", ignore);
   PrintSet(out, "BuildSpecialAddSet", special_add);
 }

 void WriteHeader(const char* header_filename) {
   std::ofstream out(header_filename);
   out << std::hex << std::setfill('0') << std::setw(4);

   out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n"
       << "// The following functions are used to build icu::UnicodeSet\n"
       << "// for specical cases different between Unicode and ECMA262.\n"
       << "#ifdef V8_INTL_SUPPORT\n"
       << "#include \"src/regexp/special-case.h\"\n\n"
       << "#include \"unicode/uniset.h\"\n"
       << "namespace v8 {\n"
       << "namespace internal {\n\n";

   PrintSpecial(out);

   out << "\n"
       << "}  // namespace internal\n"
       << "}  // namespace v8\n"
       << "#endif  // V8_INTL_SUPPORT\n";
 }

 }  // namespace internal
 }  // namespace v8

 // Entry point of the generator. Expects exactly one argument, the path of the
 // file to generate. With any other argument count it prints a usage line to
 // stderr and returns 1.
 int main(int argc, const char** argv) {
   if (argc != 2) {
     // argv[0] may be null when argc is 0; streaming a null char pointer is
     // undefined behavior, so fall back to a fixed name.
     const char* program =
         (argc > 0 && argv[0] != nullptr) ? argv[0] : "gen-regexp-special-case";
     std::cerr << "Usage: " << program << " <output filename>\n";
     // Returning from main is equivalent to std::exit() here and does not
     // depend on <cstdlib>, which this file does not include.
     return 1;
   }
   v8::internal::WriteHeader(argv[1]);

   return 0;
 }
	// Copyright 2019 the V8 project authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include <fstream>
	#include <iomanip>
	#include <iostream>
	#include <sstream>

	#include "src/base/logging.h"
	#include "unicode/uchar.h"
	#include "unicode/uniset.h"

	namespace v8 {
	namespace internal {

	// The following code generates BuildSpecialAddSet() and BuildIgnoreSet()
	// functions into "src/regexp/special-case.cc".
	// See more details in http://shorturl.at/adfO5
	// Writes to |out| the C++ source of a function named |func_name| that
	// rebuilds |set| at runtime.
	//
	// The emitted function creates an empty icu::UnicodeSet, re-adds every range
	// of |set| (a single code point uses the one-argument add(), a wider range
	// the two-argument add()), freezes the set and returns it.
	//
	// Code points are written as zero-padded hex literals of at least four
	// digits. The formatting is applied here, per value, because std::setw() only
	// affects the next insertion and therefore cannot be configured once by the
	// caller. The stream's format flags and fill character are restored before
	// returning.
	void PrintSet(std::ofstream& out, const char* func_name,
	              const icu::UnicodeSet& set) {
	  const std::ios_base::fmtflags saved_flags = out.flags();
	  const char saved_fill = out.fill('0');
	  out << std::hex;

	  out << "icu::UnicodeSet " << func_name << "() {\n"
	      << " icu::UnicodeSet set;\n";

	  const int32_t range_count = set.getRangeCount();
	  for (int32_t range = 0; range < range_count; range++) {
	    const UChar32 first = set.getRangeStart(range);
	    const UChar32 last = set.getRangeEnd(range);
	    // The width must be re-armed before every number; it resets to 0 after
	    // each formatted insertion.
	    out << " set.add(0x" << std::setw(4) << first;
	    if (first != last) {
	      out << ", 0x" << std::setw(4) << last;
	    }
	    out << ");\n";
	  }

	  out << " set.freeze();\n"
	      << " return set;\n"
	      << "}\n";

	  out.fill(saved_fill);
	  out.flags(saved_flags);
	}

	void PrintSpecial(std::ofstream& out) {
	icu::UnicodeSet current;
	icu::UnicodeSet processed(0xd800, 0xdbff); // Ignore surrogate range.
	icu::UnicodeSet special_add;
	icu::UnicodeSet ignore;
	UErrorCode status = U_ZERO_ERROR;
	icu::UnicodeSet upper("[\\p{Lu}]", status);
	CHECK(U_SUCCESS(status));
	// Iterate through all chars in BMP except ASCII and Surrogate.
	for (UChar32 i = 0x80; i < 0x010000; i++) {
	// Ignore those characters which is already processed.
	if (!processed.contains(i)) {
	current.set(i, i);
	current.closeOver(USET_CASE_INSENSITIVE);

	// Remember we already processed current.
	processed.addAll(current);

	// All uppercase characters in current.
	icu::UnicodeSet keep_upper(current);
	keep_upper.retainAll(upper);

	// Check if we have more than one uppercase character in current.
	// If there are more than one uppercase character, then it is a special
	// set which need to be added into either "Special Add" set or "Ignore"
	// set.
	int32_t number_of_upper = 0;
	for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) {
	number_of_upper +=
	keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1;
	}
	if (number_of_upper > 1) {
	// Add all non uppercase characters (could be Ll or Mn) to special add
	// set.
	current.removeAll(upper);
	special_add.addAll(current);

	// Add the uppercase characters of non uppercase character to
	// special add set.
	CHECK_GT(current.getRangeCount(), 0);
	UChar32 main_upper = u_toupper(current.getRangeStart(0));
	special_add.add(main_upper);

	// Add all uppercase except the main upper to ignore set.
	keep_upper.remove(main_upper);
	ignore.addAll(keep_upper);
	}
	}
	}

	// Remove any ASCII
	special_add.remove(0x0000, 0x007f);
	PrintSet(out, "BuildIgnoreSet", ignore);
	PrintSet(out, "BuildSpecialAddSet", special_add);
	}

	void WriteHeader(const char* header_filename) {
	std::ofstream out(header_filename);
	out << std::hex << std::setfill('0') << std::setw(4);

	out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n"
	<< "// The following functions are used to build icu::UnicodeSet\n"
	<< "// for specical cases different between Unicode and ECMA262.\n"
	<< "#ifdef V8_INTL_SUPPORT\n"
	<< "#include \"src/regexp/special-case.h\"\n\n"
	<< "#include \"unicode/uniset.h\"\n"
	<< "namespace v8 {\n"
	<< "namespace internal {\n\n";

	PrintSpecial(out);

	out << "\n"
	<< "} // namespace internal\n"
	<< "} // namespace v8\n"
	<< "#endif // V8_INTL_SUPPORT\n";
	}

	} // namespace internal
	} // namespace v8

	// Entry point of the generator. Expects exactly one argument, the path of the
	// file to generate. With any other argument count it prints a usage line to
	// stderr and returns 1.
	int main(int argc, const char** argv) {
	  if (argc != 2) {
	    // argv[0] may be null when argc is 0; streaming a null char pointer is
	    // undefined behavior, so fall back to a fixed name.
	    const char* program =
	        (argc > 0 && argv[0] != nullptr) ? argv[0] : "gen-regexp-special-case";
	    std::cerr << "Usage: " << program << " <output filename>\n";
	    // Returning from main is equivalent to std::exit() here and does not
	    // depend on <cstdlib>, which this file does not include.
	    return 1;
	  }
	  v8::internal::WriteHeader(argv[1]);

	  return 0;
	}