src/third_party/mozjs/js/src/vm/Unicode.h - cobalt - Git at Google

 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
  * vim: set ts=8 sts=4 et sw=4 tw=99:
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

 #ifndef vm_Unicode_h
 #define vm_Unicode_h

 #include "jspubtd.h"

 /*
  * Fast-path lookup tables. The inline helpers in this header only index
  * them with character codes smaller than 128.
  */
 extern const bool js_isspace[];   /* consulted by IsSpace and IsSpaceOrBOM2 */
 extern const bool js_isident[];   /* consulted by IsIdentifierPart */
 extern const bool js_isidstart[]; /* consulted by IsIdentifierStart */

 namespace js {
 namespace unicode {

 /*
  * This enum contains all the knowledge required to handle
  * Unicode in JavaScript.
  *
  * SPACE
  *   Every character that is either in the ECMA-262 5th Edition
  *   class WhiteSpace or LineTerminator.
  *
  *   WhiteSpace
  *    \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF
  *    and every other Unicode character with the General Category "Zs".
  *    In practice this is every character with the value "Zs" as the third
  *    field (after the char code in hex, and the name) called General_Category
  *    (see http://www.unicode.org/reports/tr44/#UnicodeData.txt)
  *    in the file UnicodeData.txt.
  *
  *   LineTerminator
  *    \u000A, \u000D, \u2028, \u2029
  *
  * LETTER
  *   These are all characters included in UnicodeLetter from ECMA-262.
  *   This includes the categories 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl'.
  *
  * IDENTIFIER_PART
  *   This is UnicodeCombiningMark, UnicodeDigit, UnicodeConnectorPunctuation.
  *   Aka categories Mn, Mc, Nd, Pc,
  *   and <ZWNJ> and <ZWJ>.
  *   Attention: FLAG_LETTER is _not_ IdentifierStart, but you could build
  *   a matcher for the real IdentifierPart like this:
  *
  *   if isEscapeSequence():
  *      handleEscapeSequence()
  *      return True
  *   if char in ['$', '_']:
  *      return True
  *   if GetFlag(char) & (FLAG_IDENTIFIER_PART | FLAG_LETTER):
  *      return True
  *
  * NO_DELTA
  *   See comment in CharacterInfo
  */

 /*
  * Bit flags stored in CharacterInfo::flags. Each enumerator occupies its
  * own bit so that they can be combined and tested with bitwise operators.
  */
 struct CharFlag {
     enum temp {
         SPACE           = 0x01,  /* tested by CharacterInfo::isSpace() */
         LETTER          = 0x02,  /* tested by CharacterInfo::isLetter() */
         IDENTIFIER_PART = 0x04,  /* tested (with LETTER) by isIdentifierPart() */
         NO_DELTA        = 0x08   /* upperCase/lowerCase hold char codes, not deltas */
     };
 };

 /* NO-BREAK SPACE; special-cased ahead of the table lookup in IsSpace. */
 const jschar NO_BREAK_SPACE = 0xA0;

 /* "BOM2" (0xFFFE); additionally accepted as a space by IsSpaceOrBOM2. */
 const jschar BYTE_ORDER_MARK2 = 0xFFFE;

 class CharacterInfo {
     /*
      * One entry of the js_charinfo table, as returned by CharInfo().
      *
      * upperCase and lowerCase normally store the delta between a character
      * and its upper/lower case counterpart. For example lower case "a" has
      * the char code 97 and upper case "A" has 65, so the entry for "a"
      * stores -32 in upperCase (97 + (-32) = 65) and 0 in lowerCase, because
      * the character already is lower case. The -32 is really stored as
      * (2**16 - 32), relying on unsigned overflow to give the same result.
      * For "A" the entry stores 0 in upperCase and 32 in lowerCase
      * (65 + 32 = 97).
      *
      * When CharFlag::NO_DELTA is set in flags, upperCase and lowerCase hold
      * the resulting char code itself and must not be added to the base
      * char. See ToUpperCase() and ToLowerCase().
      *
      * Storing deltas lets many characters share one entry: the whole lower
      * case latin alphabet fits into a single entry, because every member
      * is a UnicodeLetter with an upperCase delta of -32.
      */
   public:
     uint16_t upperCase;
     uint16_t lowerCase;
     uint8_t flags;

     /* True when the CharFlag::SPACE bit is set. */
     inline bool isSpace() const {
         return (flags & CharFlag::SPACE) != 0;
     }

     /* True when the CharFlag::LETTER bit is set. */
     inline bool isLetter() const {
         return (flags & CharFlag::LETTER) != 0;
     }

     /* True when either the LETTER or the IDENTIFIER_PART bit is set. */
     inline bool isIdentifierPart() const {
         const int partMask = CharFlag::LETTER | CharFlag::IDENTIFIER_PART;
         return (flags & partMask) != 0;
     }
 };

 /*
  * Tables behind CharInfo(): index1 is indexed by (code >> 5), index2 by
  * (index1 value << 5) plus the low 5 bits of the code, and the value read
  * from index2 selects the entry in js_charinfo.
  */
 extern const CharacterInfo js_charinfo[];
 extern const uint8_t index2[];
 extern const uint8_t index1[];

 /*
  * Return the CharacterInfo entry for |code| using a two-stage lookup:
  * the bits above the low 5 select a block through index1, and that block
  * number combined with the low 5 bits selects, through index2, the slot
  * in js_charinfo.
  */
 inline const CharacterInfo&
 CharInfo(jschar code)
 {
     const size_t blockBits = 5;
     const size_t blockMask = (size_t(1) << blockBits) - 1;

     const size_t block = index1[code >> blockBits];
     const size_t slot = index2[(block << blockBits) + (code & blockMask)];

     return js_charinfo[slot];
 }

 /*
  * ES5 7.6 IdentifierStart:
  *  $ (dollar sign)
  *  _ (underscore)
  *  or any UnicodeLetter.
  *
  * Codes below 128 are answered from the js_isidstart table for speed;
  * everything else is answered by the LETTER flag of its CharacterInfo.
  */
 inline bool
 IsIdentifierStart(jschar ch)
 {
     return ch < 128 ? js_isidstart[ch] : CharInfo(ch).isLetter();
 }

 /*
  * Matches ES5 7.6 IdentifierPart. Codes below 128 are answered from the
  * js_isident table; everything else goes through the CharacterInfo flags.
  */
 inline bool
 IsIdentifierPart(jschar ch)
 {
     return ch < 128 ? js_isident[ch] : CharInfo(ch).isIdentifierPart();
 }

 /* True when the CharacterInfo entry of |ch| carries the LETTER flag. */
 inline bool
 IsLetter(jschar ch)
 {
     const CharacterInfo &entry = CharInfo(ch);
     return entry.isLetter();
 }

 /*
  * IsSpace checks whether a character belongs to the merged set of
  * WhiteSpace and LineTerminator, specified by ES5 7.2 and 7.3. The two
  * are combined because in practice nearly every caller wants this,
  * except some code in the tokenizer.
  *
  * ASCII-7 characters are very common and must be handled quickly in the
  * tokenizer, so they are answered from a lookup table. NO-BREAK SPACE is
  * supposed to be the most common character outside that range, so it is
  * checked inline before falling back to the full character table.
  */
 inline bool
 IsSpace(jschar ch)
 {
     if (ch >= 128)
         return ch == NO_BREAK_SPACE || CharInfo(ch).isSpace();

     return js_isspace[ch];
 }

 /*
  * Same answer as the space test above for every character, except that
  * BOM2 (0xFFFE) is also accepted, for compatibility reasons in the parser.
  */
 inline bool
 IsSpaceOrBOM2(jschar ch)
 {
     if (ch >= 128) {
         if (ch == NO_BREAK_SPACE)
             return true;
         if (ch == BYTE_ORDER_MARK2)
             return true;
         return CharInfo(ch).isSpace();
     }

     return js_isspace[ch];
 }

 /*
  * Map |ch| to upper case using its CharacterInfo entry. Normally the
  * entry's upperCase field is a delta that is added to the char code; when
  * the NO_DELTA flag is set the field already is the resulting char code.
  */
 inline jschar
 ToUpperCase(jschar ch)
 {
     const CharacterInfo &entry = CharInfo(ch);

     if (!(entry.flags & CharFlag::NO_DELTA))
         return uint16_t(ch) + entry.upperCase;

     /* NO_DELTA: the table stores the actual char code. */
     return entry.upperCase;
 }

 /*
  * Map |ch| to lower case using its CharacterInfo entry. Normally the
  * entry's lowerCase field is a delta that is added to the char code; when
  * the NO_DELTA flag is set the field already is the resulting char code.
  */
 inline jschar
 ToLowerCase(jschar ch)
 {
     const CharacterInfo &entry = CharInfo(ch);

     if (!(entry.flags & CharFlag::NO_DELTA))
         return uint16_t(ch) + entry.lowerCase;

     /* NO_DELTA: the table stores the actual char code. */
     return entry.lowerCase;
 }

 } /* namespace unicode */
 } /* namespace js */

 #endif /* vm_Unicode_h */
	/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
	* vim: set ts=8 sts=4 et sw=4 tw=99:
	* This Source Code Form is subject to the terms of the Mozilla Public
	* License, v. 2.0. If a copy of the MPL was not distributed with this
	* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

	#ifndef vm_Unicode_h
	#define vm_Unicode_h

	#include "jspubtd.h"

	/*
	 * Fast-path lookup tables. The inline helpers in this header only index
	 * them with character codes smaller than 128.
	 */
	extern const bool js_isspace[];   /* consulted by IsSpace and IsSpaceOrBOM2 */
	extern const bool js_isident[];   /* consulted by IsIdentifierPart */
	extern const bool js_isidstart[]; /* consulted by IsIdentifierStart */

	namespace js {
	namespace unicode {

	/*
	* This enum contains all the knowledge required to handle
	* Unicode in JavaScript.
	*
	* SPACE
	* Every character that is either in the ECMA-262 5th Edition
	* class WhiteSpace or LineTerminator.
	*
	* WhiteSpace
	* \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF
	* and every other Unicode character with the General Category "Zs".
	* In practice this is every character with the value "Zs" as the third
	* field (after the char code in hex, and the name) called General_Category
	* (see http://www.unicode.org/reports/tr44/#UnicodeData.txt)
	* in the file UnicodeData.txt.
	*
	* LineTerminator
	* \u000A, \u000D, \u2028, \u2029
	*
	* LETTER
	* These are all characters included in UnicodeLetter from ECMA-262.
	* This includes the categories 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl'.
	*
	* IDENTIFIER_PART
	* This is UnicodeCombiningMark, UnicodeDigit, UnicodeConnectorPunctuation.
	* Aka categories Mn, Mc, Nd, Pc,
	* and <ZWNJ> and <ZWJ>.
	* Attention: FLAG_LETTER is _not_ IdentifierStart, but you could build
	* a matcher for the real IdentifierPart like this:
	*
	* if isEscapeSequence():
	*    handleEscapeSequence()
	*    return True
	* if char in ['$', '_']:
	*    return True
	* if GetFlag(char) & (FLAG_IDENTIFIER_PART | FLAG_LETTER):
	*    return True
	*
	* NO_DELTA
	* See comment in CharacterInfo
	*/

	/*
	 * Bit flags stored in CharacterInfo::flags. Each enumerator occupies its
	 * own bit so that they can be combined and tested with bitwise operators.
	 */
	struct CharFlag {
	    enum temp {
	        SPACE           = 0x01,  /* tested by CharacterInfo::isSpace() */
	        LETTER          = 0x02,  /* tested by CharacterInfo::isLetter() */
	        IDENTIFIER_PART = 0x04,  /* tested (with LETTER) by isIdentifierPart() */
	        NO_DELTA        = 0x08   /* upperCase/lowerCase hold char codes, not deltas */
	    };
	};

	/* NO-BREAK SPACE; special-cased ahead of the table lookup in IsSpace. */
	const jschar NO_BREAK_SPACE = 0xA0;

	/* "BOM2" (0xFFFE); additionally accepted as a space by IsSpaceOrBOM2. */
	const jschar BYTE_ORDER_MARK2 = 0xFFFE;

	class CharacterInfo {
	    /*
	     * One entry of the js_charinfo table, as returned by CharInfo().
	     *
	     * upperCase and lowerCase normally store the delta between a character
	     * and its upper/lower case counterpart. For example lower case "a" has
	     * the char code 97 and upper case "A" has 65, so the entry for "a"
	     * stores -32 in upperCase (97 + (-32) = 65) and 0 in lowerCase, because
	     * the character already is lower case. The -32 is really stored as
	     * (2**16 - 32), relying on unsigned overflow to give the same result.
	     * For "A" the entry stores 0 in upperCase and 32 in lowerCase
	     * (65 + 32 = 97).
	     *
	     * When CharFlag::NO_DELTA is set in flags, upperCase and lowerCase hold
	     * the resulting char code itself and must not be added to the base
	     * char. See ToUpperCase() and ToLowerCase().
	     *
	     * Storing deltas lets many characters share one entry: the whole lower
	     * case latin alphabet fits into a single entry, because every member
	     * is a UnicodeLetter with an upperCase delta of -32.
	     */
	  public:
	    uint16_t upperCase;
	    uint16_t lowerCase;
	    uint8_t flags;

	    /* True when the CharFlag::SPACE bit is set. */
	    inline bool isSpace() const {
	        return flags & CharFlag::SPACE;
	    }

	    /* True when the CharFlag::LETTER bit is set. */
	    inline bool isLetter() const {
	        return flags & CharFlag::LETTER;
	    }

	    /*
	     * True when either the LETTER or the IDENTIFIER_PART bit is set.
	     * The operator here must be a plain bitwise OR; a backslash-escaped
	     * pipe is not valid C++.
	     */
	    inline bool isIdentifierPart() const {
	        return flags & (CharFlag::IDENTIFIER_PART | CharFlag::LETTER);
	    }
	};

	/*
	 * Tables behind CharInfo(): index1 is indexed by (code >> 5), index2 by
	 * (index1 value << 5) plus the low 5 bits of the code, and the value read
	 * from index2 selects the entry in js_charinfo.
	 */
	extern const CharacterInfo js_charinfo[];
	extern const uint8_t index2[];
	extern const uint8_t index1[];

	/*
	 * Return the CharacterInfo entry for |code| using a two-stage lookup:
	 * the bits above the low 5 select a block through index1, and that block
	 * number combined with the low 5 bits selects, through index2, the slot
	 * in js_charinfo.
	 */
	inline const CharacterInfo&
	CharInfo(jschar code)
	{
	    const size_t blockBits = 5;
	    const size_t blockMask = (size_t(1) << blockBits) - 1;

	    const size_t block = index1[code >> blockBits];
	    const size_t slot = index2[(block << blockBits) + (code & blockMask)];

	    return js_charinfo[slot];
	}

	/*
	 * ES5 7.6 IdentifierStart:
	 *  $ (dollar sign)
	 *  _ (underscore)
	 *  or any UnicodeLetter.
	 *
	 * Codes below 128 are answered from the js_isidstart table for speed;
	 * everything else is answered by the LETTER flag of its CharacterInfo.
	 */
	inline bool
	IsIdentifierStart(jschar ch)
	{
	    return ch < 128 ? js_isidstart[ch] : CharInfo(ch).isLetter();
	}

	/*
	 * Matches ES5 7.6 IdentifierPart. Codes below 128 are answered from the
	 * js_isident table; everything else goes through the CharacterInfo flags.
	 */
	inline bool
	IsIdentifierPart(jschar ch)
	{
	    return ch < 128 ? js_isident[ch] : CharInfo(ch).isIdentifierPart();
	}

	/* True when the CharacterInfo entry of |ch| carries the LETTER flag. */
	inline bool
	IsLetter(jschar ch)
	{
	    const CharacterInfo &entry = CharInfo(ch);
	    return entry.isLetter();
	}

	/*
	 * IsSpace checks whether a character belongs to the merged set of
	 * WhiteSpace and LineTerminator, specified by ES5 7.2 and 7.3. The two
	 * are combined because in practice nearly every caller wants this,
	 * except some code in the tokenizer.
	 *
	 * ASCII-7 characters are very common and must be handled quickly in the
	 * tokenizer, so they are answered from a lookup table. NO-BREAK SPACE is
	 * supposed to be the most common character outside that range, so it is
	 * checked inline before falling back to the full character table.
	 */
	inline bool
	IsSpace(jschar ch)
	{
	    if (ch >= 128)
	        return ch == NO_BREAK_SPACE || CharInfo(ch).isSpace();

	    return js_isspace[ch];
	}

	/*
	 * Report whether |ch| counts as a space, additionally accepting BOM2.
	 *
	 * Codes below 128 are answered from the js_isspace table. NO-BREAK SPACE
	 * and BOM2 are accepted directly, and every other character is answered
	 * by the SPACE flag of its CharacterInfo entry.
	 */
	inline bool
	IsSpaceOrBOM2(jschar ch)
	{
	    if (ch < 128)
	        return js_isspace[ch];

	    /*
	     * We accept BOM2 (0xFFFE) for compatibility reasons in the parser.
	     * The operator must be a plain logical OR; backslash-escaped pipes
	     * are not valid C++.
	     */
	    if (ch == NO_BREAK_SPACE || ch == BYTE_ORDER_MARK2)
	        return true;

	    return CharInfo(ch).isSpace();
	}

	/*
	 * Map |ch| to upper case using its CharacterInfo entry. Normally the
	 * entry's upperCase field is a delta that is added to the char code; when
	 * the NO_DELTA flag is set the field already is the resulting char code.
	 */
	inline jschar
	ToUpperCase(jschar ch)
	{
	    const CharacterInfo &entry = CharInfo(ch);

	    if (!(entry.flags & CharFlag::NO_DELTA))
	        return uint16_t(ch) + entry.upperCase;

	    /* NO_DELTA: the table stores the actual char code. */
	    return entry.upperCase;
	}

	/*
	 * Map |ch| to lower case using its CharacterInfo entry. Normally the
	 * entry's lowerCase field is a delta that is added to the char code; when
	 * the NO_DELTA flag is set the field already is the resulting char code.
	 */
	inline jschar
	ToLowerCase(jschar ch)
	{
	    const CharacterInfo &entry = CharInfo(ch);

	    if (!(entry.flags & CharFlag::NO_DELTA))
	        return uint16_t(ch) + entry.lowerCase;

	    /* NO_DELTA: the table stores the actual char code. */
	    return entry.lowerCase;
	}

	} /* namespace unicode */
	} /* namespace js */

	#endif /* vm_Unicode_h */