blob: 5df21525399180bb37d0e22d03e1b191e48dedb0 [file] [log] [blame]
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
* vim: set ts=8 sts=4 et sw=4 tw=99:
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef vm_Unicode_h
#define vm_Unicode_h
#include "jspubtd.h"
/*
 * Boolean lookup tables consulted by the inline predicates below for
 * character codes under 128: js_isidstart by IsIdentifierStart, js_isident
 * by IsIdentifierPart, and js_isspace by IsSpace and IsSpaceOrBOM2.
 * The tables are only declared here; their definitions live elsewhere.
 */
extern const bool js_isidstart[];
extern const bool js_isident[];
extern const bool js_isspace[];
namespace js {
namespace unicode {
/*
* This enum contains all the knowledge required to handle
* Unicode in JavaScript.
*
* SPACE
* Every character that is either in the ECMA-262 5th Edition
* class WhiteSpace or LineTerminator.
*
* WhiteSpace
* \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF
* and every other Unicode character with the General Category "Zs".
* In practice this is every character with the value "Zs" as the third
* field (after the char code in hex, and the name) called General_Category
* (see http://www.unicode.org/reports/tr44/#UnicodeData.txt)
* in the file UnicodeData.txt.
*
* LineTerminator
* \u000A, \u000D, \u2028, \u2029
*
* LETTER
* These are all the characters included in UnicodeLetter from ECMA-262.
* This includes the categories 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl'.
*
* IDENTIFIER_PART
* This is UnicodeCombiningMark, UnicodeDigit, UnicodeConnectorPunctuation.
* Aka categories Mn/Mc, Nd, Pc
* And <ZWNJ> and <ZWJ>.
* Attention: FLAG_LETTER is _not_ IdentifierStart, but you could build
* a matcher for the real IdentifierPart like this:
*
* if isEscapeSequence():
* handleEscapeSequence()
* return True
* if char in ['$', '_']:
* return True
* if GetFlag(char) & (FLAG_IDENTIFIER_PART | FLAG_LETTER):
* return True
*
* NO_DELTA
* See comment in CharacterInfo
*/
/*
 * Single-bit masks that are tested against CharacterInfo::flags. Each
 * enumerator occupies its own bit so that several of them can be or-ed
 * together into one mask (as isIdentifierPart() does).
 */
struct CharFlag {
    enum temp {
        SPACE           = 0x01,
        LETTER          = 0x02,
        IDENTIFIER_PART = 0x04,
        NO_DELTA        = 0x08
    };
};
/*
 * Code units that the whitespace predicates below test for explicitly
 * instead of going through the CharInfo() table lookup: IsSpace and
 * IsSpaceOrBOM2 both special-case NO_BREAK_SPACE, and IsSpaceOrBOM2
 * additionally accepts BYTE_ORDER_MARK2.
 */
const jschar BYTE_ORDER_MARK2 = 0xFFFE;
const jschar NO_BREAK_SPACE = 0x00A0;
class CharacterInfo {
    /*
     * upperCase and lowerCase normally hold the *delta* between a character
     * and its upper/lower case counterpart rather than the counterpart
     * itself. Take the lower case alpha (a), char code 97, whose upper case
     * form (A) is 65: for "a" we store -32 in upperCase (97 + (-32) = 65)
     * and 0 in lowerCase, since the char is already lower case. More
     * precisely, the stored value is (2**16 - 32) rather than -32, so that
     * unsigned overflow yields the same mathematical result. For "A" we
     * store 0 in upperCase and 32 in lowerCase (65 + 32 = 97).
     *
     * When the delta cannot be represented in the 16-bit field, the flag
     * CharFlag::NO_DELTA is set and upperCase/lowerCase hold the resulting
     * char code directly; it must not be added to the base char. See
     * ToUpperCase() and ToLowerCase().
     *
     * Storing deltas lets many characters share a single entry. The whole
     * lower case latin alphabet, for instance, fits into one entry: every
     * member is a UnicodeLetter and upperCase is always -32.
     */
  public:
    uint16_t upperCase;
    uint16_t lowerCase;
    uint8_t flags;

    /* True when the SPACE bit is set in flags. */
    bool isSpace() const {
        return (flags & CharFlag::SPACE) != 0;
    }

    /* True when the LETTER bit is set in flags. */
    bool isLetter() const {
        return (flags & CharFlag::LETTER) != 0;
    }

    /* True when either the IDENTIFIER_PART or the LETTER bit is set. */
    bool isIdentifierPart() const {
        const int identifierMask = CharFlag::IDENTIFIER_PART | CharFlag::LETTER;
        return (flags & identifierMask) != 0;
    }
};
/*
 * Tables read by CharInfo(): index1 is indexed with (code >> 5); the value
 * found there, shifted left by 5 and combined with the low five bits of the
 * code, indexes index2; the index2 value in turn selects the entry of
 * js_charinfo that is returned. Only declared here; defined elsewhere.
 */
extern const uint8_t index1[];
extern const uint8_t index2[];
extern const CharacterInfo js_charinfo[];
/*
 * Fetch the CharacterInfo entry for |code| via the two-level index tables.
 * The bits of |code| above the low five select a block number in index1;
 * that block number together with the low five bits selects the
 * js_charinfo slot stored in index2.
 */
inline const CharacterInfo&
CharInfo(jschar code)
{
    const size_t blockBits = 5;
    const size_t lowMask = (size_t(1) << blockBits) - 1;

    const size_t block = index1[code >> blockBits];
    const size_t slot = index2[(block << blockBits) + (code & lowMask)];
    return js_charinfo[slot];
}
/*
 * ES5 7.6 IdentifierStart:
 *   $ (dollar sign), _ (underscore), or any UnicodeLetter.
 *
 * Codes below 128 are small and therefore common, so they are answered
 * straight from the js_isidstart lookup table for speed; everything else
 * goes through the CharInfo() tables.
 */
inline bool
IsIdentifierStart(jschar ch)
{
    return ch < 128 ? js_isidstart[ch] : CharInfo(ch).isLetter();
}
/*
 * Matches ES5 7.6 IdentifierPart. Codes below 128 are read from the
 * js_isident table; all others are classified through CharInfo().
 */
inline bool
IsIdentifierPart(jschar ch)
{
    if (ch >= 128) {
        const CharacterInfo &entry = CharInfo(ch);
        return entry.isIdentifierPart();
    }
    return js_isident[ch];
}
/* Report whether the CharInfo() entry for |ch| carries the LETTER flag. */
inline bool
IsLetter(jschar ch)
{
    const CharacterInfo &entry = CharInfo(ch);
    return entry.isLetter();
}
/*
 * Test whether |ch| belongs to the merged set of WhiteSpace and
 * LineTerminator (ES5 7.2 and 7.3). The two classes are combined because,
 * in practice, nearly every caller wants both; only some tokenizer code
 * needs to tell them apart.
 *
 * ASCII-7 characters are very common and must be handled quickly in the
 * tokenizer, so they are served from the js_isspace lookup table.
 * NO-BREAK SPACE is supposed to be the most common character outside that
 * range, so it is checked inline before falling back to CharInfo().
 */
inline bool
IsSpace(jschar ch)
{
    if (ch >= 128)
        return ch == NO_BREAK_SPACE || CharInfo(ch).isSpace();
    return js_isspace[ch];
}
/*
 * Same classification as IsSpace(), except that BOM2 (0xFFFE) is also
 * accepted; the parser relies on this for compatibility reasons.
 */
inline bool
IsSpaceOrBOM2(jschar ch)
{
    if (ch >= 128) {
        /* Inline checks for the two special-cased code units. */
        if (ch == BYTE_ORDER_MARK2 || ch == NO_BREAK_SPACE)
            return true;
        return CharInfo(ch).isSpace();
    }
    return js_isspace[ch];
}
/*
 * Map |ch| to its upper case form using the CharInfo() entry. The entry
 * normally stores a delta that is added to |ch|; the result is narrowed
 * back to jschar on return.
 */
inline jschar
ToUpperCase(jschar ch)
{
    const CharacterInfo &entry = CharInfo(ch);
    const bool storesCharCode = (entry.flags & CharFlag::NO_DELTA) != 0;

    /*
     * With NO_DELTA set the delta could not be stored, so upperCase holds
     * the actual char code and is returned as is.
     */
    if (storesCharCode)
        return entry.upperCase;

    return uint16_t(ch) + entry.upperCase;
}
/*
 * Map |ch| to its lower case form using the CharInfo() entry. The entry
 * normally stores a delta that is added to |ch|; the result is narrowed
 * back to jschar on return.
 */
inline jschar
ToLowerCase(jschar ch)
{
    const CharacterInfo &entry = CharInfo(ch);
    const bool storesCharCode = (entry.flags & CharFlag::NO_DELTA) != 0;

    /* With NO_DELTA set, lowerCase is the char code itself, not a delta. */
    if (storesCharCode)
        return entry.lowerCase;

    return uint16_t(ch) + entry.lowerCase;
}
} /* namespace unicode */
} /* namespace js */
#endif /* vm_Unicode_h */