// src/v8/src/regexp/special-case.h - cobalt - Git at Google

 // Copyright 2019 the V8 project authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #ifndef V8_REGEXP_SPECIAL_CASE_H_
 #define V8_REGEXP_SPECIAL_CASE_H_

 #ifdef V8_INTL_SUPPORT
 #include "unicode/uversion.h"
 // Forward-declare UnicodeSet in the ICU namespace instead of pulling in its
 // full definition; this header only names the type in function declarations.
 // NOTE(review): U_ICU_NAMESPACE (and the `icu` alias used below) presumably
 // come from unicode/uversion.h -- confirm.
 namespace U_ICU_NAMESPACE {
 class UnicodeSet;
 }  //  namespace U_ICU_NAMESPACE

 namespace v8 {
 namespace internal {

 // Functions to build special sets of Unicode characters that need special
 // handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE).
 //
 // For the characters in the "ignore set", the process should not treat other
 // characters in the result of closeOver(USET_CASE_INSENSITIVE) as case
 // equivalent under the ECMA262 RegExp "i" mode, because these characters are
 // themselves uppercase and no other character in the set uppercases to them.
 //
 // For the characters in the "special add set", the process should add only
 // those characters in the result of closeOver(USET_CASE_INSENSITIVE) which are
 // not uppercase characters as case equivalent under the ECMA262 RegExp "i"
 // mode, and also the ONE uppercase character that the other non-uppercase
 // characters uppercase into. Other uppercase characters in the result of
 // closeOver(USET_CASE_INSENSITIVE) should not be considered, because ECMA262
 // RegExp "i" mode considers two characters "case equivalent" if both
 // characters uppercase to the same character.
 //
 // For example, consider the following case equivalent set defined by the
 // Unicode standard. Notice that there is more than one uppercase character in
 // this set:
 //  U+212B Å Angstrom Sign - an uppercase character.
 //  U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character.
 //  U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which
 //    uppercases to U+00C5.
 // This case equivalent set is a special set and needs special handling when
 // considering "case equivalent" under the ECMA262 RegExp "i" mode, which
 // differs from the Unicode Standard:
 //  * U+212B should be included in the "ignore" set because, under the ECMA262
 //    "i" mode, no other characters are considered "case equivalent" to it:
 //    U+212B is itself uppercase, but neither U+00C5 nor U+00E5 uppercases to
 //    U+212B.
 //  * U+00C5 and U+00E5 will both be included in the "special add" set. While
 //    calculating the "equivalent set" under ECMA262 "i" mode, the process will
 //    add U+00E5, because it is not an uppercase character in the set. The
 //    process will also add U+00C5, because it is the uppercase character which
 //    the other non-uppercase character, U+00E5, uppercases into.
 //
 // For characters not included in "ignore set" and "special add set", the
 // process will just use closeOver(USET_CASE_INSENSITIVE) to calculate, which is
 // much faster.
 //
 // Under Unicode 12.0, there are only 7 characters in the "special add set" and
 // 4 characters in the "ignore set", so even though the special add process is
 // slower, it is limited to a small set of cases only.
 //
 // The implementation of these two functions will be generated by calling ICU
 // icu::UnicodeSet during the build time into gen/src/regexp/special-case.cc by
 // the code in src/regexp/gen-regexp-special-case.cc.
 //
 // These two functions will be used with the LazyInstance<> template to
 // generate globally sharable sets to reduce memory usage and speed up
 // performance.

 // Builds and returns the "ignore set": the characters that, under the ECMA262
 // RegExp "i" mode, must not be treated as case equivalent to the other
 // characters produced by closeOver(USET_CASE_INSENSITIVE). The set is returned
 // by value; per the file comment above, the definition is generated at build
 // time.
 icu::UnicodeSet BuildIgnoreSet();

 // Builds and returns the "special add set": the characters for which only the
 // non-uppercase members of closeOver(USET_CASE_INSENSITIVE), plus the one
 // uppercase character they uppercase into, are added as case equivalents
 // under the ECMA262 RegExp "i" mode. The set is returned by value; per the
 // file comment above, the definition is generated at build time.
 icu::UnicodeSet BuildSpecialAddSet();

 }  // namespace internal
 }  // namespace v8

 #endif  // V8_INTL_SUPPORT

 #endif  // V8_REGEXP_SPECIAL_CASE_H_
	// Copyright 2019 the V8 project authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#ifndef V8_REGEXP_SPECIAL_CASE_H_
	#define V8_REGEXP_SPECIAL_CASE_H_

	#ifdef V8_INTL_SUPPORT
	#include "unicode/uversion.h"
	// Forward-declare UnicodeSet in the ICU namespace instead of pulling in its
	// full definition; this header only names the type in function declarations.
	// NOTE(review): U_ICU_NAMESPACE (and the `icu` alias used below) presumably
	// come from unicode/uversion.h -- confirm.
	namespace U_ICU_NAMESPACE {
	class UnicodeSet;
	} // namespace U_ICU_NAMESPACE

	namespace v8 {
	namespace internal {

	// Functions to build special sets of Unicode characters that need special
	// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE).
	//
	// For the characters in the "ignore set", the process should not treat other
	// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case
	// equivalent under the ECMA262 RegExp "i" mode, because these characters are
	// themselves uppercase and no other character in the set uppercases to them.
	//
	// For the characters in the "special add set", the process should add only
	// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which are
	// not uppercase characters as case equivalent under the ECMA262 RegExp "i"
	// mode, and also the ONE uppercase character that the other non-uppercase
	// characters uppercase into. Other uppercase characters in the result of
	// closeOver(USET_CASE_INSENSITIVE) should not be considered, because ECMA262
	// RegExp "i" mode considers two characters "case equivalent" if both
	// characters uppercase to the same character.
	//
	// For example, consider the following case equivalent set defined by the
	// Unicode standard. Notice that there is more than one uppercase character in
	// this set:
	// U+212B Å Angstrom Sign - an uppercase character.
	// U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character.
	// U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which
	// uppercases to U+00C5.
	// This case equivalent set is a special set and needs special handling when
	// considering "case equivalent" under the ECMA262 RegExp "i" mode, which
	// differs from the Unicode Standard:
	// * U+212B should be included in the "ignore" set because, under the ECMA262
	// "i" mode, no other characters are considered "case equivalent" to it:
	// U+212B is itself uppercase, but neither U+00C5 nor U+00E5 uppercases to
	// U+212B.
	// * U+00C5 and U+00E5 will both be included in the "special add" set. While
	// calculating the "equivalent set" under ECMA262 "i" mode, the process will
	// add U+00E5, because it is not an uppercase character in the set. The
	// process will also add U+00C5, because it is the uppercase character which
	// the other non-uppercase character, U+00E5, uppercases into.
	//
	// For characters not included in "ignore set" and "special add set", the
	// process will just use closeOver(USET_CASE_INSENSITIVE) to calculate, which is
	// much faster.
	//
	// Under Unicode 12.0, there are only 7 characters in the "special add set" and
	// 4 characters in the "ignore set", so even though the special add process is
	// slower, it is limited to a small set of cases only.
	//
	// The implementation of these two functions will be generated by calling ICU
	// icu::UnicodeSet during the build time into gen/src/regexp/special-case.cc by
	// the code in src/regexp/gen-regexp-special-case.cc.
	//
	// These two functions will be used with the LazyInstance<> template to
	// generate globally sharable sets to reduce memory usage and speed up
	// performance.

	// Builds and returns the "ignore set": the characters that, under the ECMA262
	// RegExp "i" mode, must not be treated as case equivalent to the other
	// characters produced by closeOver(USET_CASE_INSENSITIVE). The set is returned
	// by value; per the file comment above, the definition is generated at build
	// time.
	icu::UnicodeSet BuildIgnoreSet();

	// Builds and returns the "special add set": the characters for which only the
	// non-uppercase members of closeOver(USET_CASE_INSENSITIVE), plus the one
	// uppercase character they uppercase into, are added as case equivalents
	// under the ECMA262 RegExp "i" mode. The set is returned by value; per the
	// file comment above, the definition is generated at build time.
	icu::UnicodeSet BuildSpecialAddSet();

	} // namespace internal
	} // namespace v8

	#endif // V8_INTL_SUPPORT

	#endif // V8_REGEXP_SPECIAL_CASE_H_