third_party/v8/src/regexp/special-case.h - cobalt - Git at Google

 // Copyright 2019 the V8 project authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #ifndef V8_REGEXP_SPECIAL_CASE_H_
 #define V8_REGEXP_SPECIAL_CASE_H_

 #ifdef V8_INTL_SUPPORT
 #include "src/base/logging.h"
 #include "src/common/globals.h"

 #include "unicode/uchar.h"
 #include "unicode/uniset.h"
 #include "unicode/unistr.h"

 namespace v8 {
 namespace internal {

 // Sets of Unicode characters that need special handling under "i" mode

 // For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262
 // defines slightly different case-folding rules than Unicode. An
 // input character should match a pattern character if the result of
 // the Canonicalize algorithm is the same for both characters.
 //
 // Roughly speaking, for "i" regexps, Canonicalize(c) is the same as
 // c.toUpperCase(), unless a) c.toUpperCase() is a multi-character
 // string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See
 // https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for
 // the precise definition.
 //
 // While compiling such regular expressions, we need to compute the
 // set of characters that should match a given input character. (See
 // GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.)
 // For almost all characters, this can be efficiently computed using
 // UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent
 // the remaining special cases.
 //
 // For a character c, the rules are as follows:
 //
 // 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling
 //    UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet
 //    containing c will produce the set of characters that should
 //    match /c/i (or /[c]/i), and only those characters.
 //
 // 2. If c is in IgnoreSet, then the only character it should match is
 //    itself. However, closeOver will add additional incorrect
 //    matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ'
 //    (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is
 //    "SS".  Step 3.e therefore requires that 'ß' canonicalizes to
 //    itself, and should not match 'ẞ'. In these cases, we can skip
 //    the closeOver entirely, because it will never add an equivalent
 //    character.
 //
 // 3. If c is in SpecialAddSet, then it should match at least one
 //    character other than itself. However, closeOver will add at
 //    least one additional incorrect match. For example, consider the
 //    letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase
 //    K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN
 //    SIGN should not match either of the other two characters. As a
 //    result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in
 //    IgnoreSet). To find the correct matches for characters in
 //    SpecialAddSet, we closeOver the original character, but filter
 //    out the results that do not have the same canonical value.
 //
 // The contents of these sets are calculated at build time by
 // src/regexp/gen-regexp-special-case.cc, which generates
 // gen/src/regexp/special-case.cc. This is done by iterating over the
 // result of closeOver for each BMP character, and finding sets for
 // which at least one character has a different canonical value than
 // another character. Characters that match no other characters in
 // their equivalence class are added to IgnoreSet. Characters that
 // match at least one other character are added to SpecialAddSet.

 class RegExpCaseFolding final : public AllStatic {
  public:
   static const icu::UnicodeSet& IgnoreSet();
   static const icu::UnicodeSet& SpecialAddSet();

   // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
   // Canonicalize) step 3, which is used to determine whether
   // characters match when ignoreCase is true and unicode is false.
   static UChar32 Canonicalize(UChar32 ch) {
     // a. Assert: ch is a UTF-16 code unit.
     CHECK_LE(ch, 0xffff);

     // b. Let s be the String value consisting of the single code unit ch.
     icu::UnicodeString s(ch);

     // c. Let u be the same result produced as if by performing the algorithm
     // for String.prototype.toUpperCase using s as the this value.
     // d. Assert: Type(u) is String.
     icu::UnicodeString& u = s.toUpper();

     // e. If u does not consist of a single code unit, return ch.
     if (u.length() != 1) {
       return ch;
     }

     // f. Let cu be u's single code unit element.
     UChar32 cu = u.char32At(0);

     // g. If the value of ch >= 128 and the value of cu < 128, return ch.
     if (ch >= 128 && cu < 128) {
       return ch;
     }

     // h. Return cu.
     return cu;
   }
 };

 }  // namespace internal
 }  // namespace v8

 #endif  // V8_INTL_SUPPORT

 #endif  // V8_REGEXP_SPECIAL_CASE_H_
	// Copyright 2019 the V8 project authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#ifndef V8_REGEXP_SPECIAL_CASE_H_
	#define V8_REGEXP_SPECIAL_CASE_H_

	#ifdef V8_INTL_SUPPORT
	#include "src/base/logging.h"
	#include "src/common/globals.h"

	#include "unicode/uchar.h"
	#include "unicode/uniset.h"
	#include "unicode/unistr.h"

	namespace v8 {
	namespace internal {

	// Sets of Unicode characters that need special handling under "i" mode

	// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262
	// defines slightly different case-folding rules than Unicode. An
	// input character should match a pattern character if the result of
	// the Canonicalize algorithm is the same for both characters.
	//
	// Roughly speaking, for "i" regexps, Canonicalize(c) is the same as
	// c.toUpperCase(), unless a) c.toUpperCase() is a multi-character
	// string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See
	// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for
	// the precise definition.
	//
	// While compiling such regular expressions, we need to compute the
	// set of characters that should match a given input character. (See
	// GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.)
	// For almost all characters, this can be efficiently computed using
	// UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent
	// the remaining special cases.
	//
	// For a character c, the rules are as follows:
	//
	// 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling
	// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet
	// containing c will produce the set of characters that should
	// match /c/i (or /[c]/i), and only those characters.
	//
	// 2. If c is in IgnoreSet, then the only character it should match is
	// itself. However, closeOver will add additional incorrect
	// matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ'
	// (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is
	// "SS". Step 3.e therefore requires that 'ß' canonicalizes to
	// itself, and should not match 'ẞ'. In these cases, we can skip
	// the closeOver entirely, because it will never add an equivalent
	// character.
	//
	// 3. If c is in SpecialAddSet, then it should match at least one
	// character other than itself. However, closeOver will add at
	// least one additional incorrect match. For example, consider the
	// letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase
	// K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN
	// SIGN should not match either of the other two characters. As a
	// result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in
	// IgnoreSet). To find the correct matches for characters in
	// SpecialAddSet, we closeOver the original character, but filter
	// out the results that do not have the same canonical value.
	//
	// The contents of these sets are calculated at build time by
	// src/regexp/gen-regexp-special-case.cc, which generates
	// gen/src/regexp/special-case.cc. This is done by iterating over the
	// result of closeOver for each BMP character, and finding sets for
	// which at least one character has a different canonical value than
	// another character. Characters that match no other characters in
	// their equivalence class are added to IgnoreSet. Characters that
	// match at least one other character are added to SpecialAddSet.

	class RegExpCaseFolding final : public AllStatic {
	public:
	static const icu::UnicodeSet& IgnoreSet();
	static const icu::UnicodeSet& SpecialAddSet();

	// This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
	// Canonicalize) step 3, which is used to determine whether
	// characters match when ignoreCase is true and unicode is false.
	static UChar32 Canonicalize(UChar32 ch) {
	// a. Assert: ch is a UTF-16 code unit.
	CHECK_LE(ch, 0xffff);

	// b. Let s be the String value consisting of the single code unit ch.
	icu::UnicodeString s(ch);

	// c. Let u be the same result produced as if by performing the algorithm
	// for String.prototype.toUpperCase using s as the this value.
	// d. Assert: Type(u) is String.
	icu::UnicodeString& u = s.toUpper();

	// e. If u does not consist of a single code unit, return ch.
	if (u.length() != 1) {
	return ch;
	}

	// f. Let cu be u's single code unit element.
	UChar32 cu = u.char32At(0);

	// g. If the value of ch >= 128 and the value of cu < 128, return ch.
	if (ch >= 128 && cu < 128) {
	return ch;
	}

	// h. Return cu.
	return cu;
	}
	};

	} // namespace internal
	} // namespace v8

	#endif // V8_INTL_SUPPORT

	#endif // V8_REGEXP_SPECIAL_CASE_H_