blob: 8c2d590fcc574533ace4e156b7f193b740c95da2 [file] [log] [blame]
// Copyright 2014 Google Inc. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.
#include <deque>
#include <stack>
#include <string>
#include "base/basictypes.h"
#include "cobalt/css_parser/grammar.h"
#include "cobalt/cssom/media_feature_names.h"
#include "third_party/icu/source/common/unicode/umachine.h"
namespace cobalt {
namespace css_parser {
class StringPool;
typedef int Token;
struct TrivialStringPiece;
// Being initialized with a zero-terminated input string in UTF-8 encoding,
// a scanner produces a series of CSS tokens, ending with kEndOfFileToken.
// The scanner is designed to be resilient to errors in the input and is
// supposed to reach the end of the input in any case.
// The scanner tries to minimize the amount of string copying, with most
// returned strings referencing the input. If a string copy is needed because
// a string contains an escape sequence, the copy will be allocated on an
// injected string pool.
// This implementation is an adaptation of the hand-written CSS scanner from
// WebKit that has shown a 2x performance improvement over the one generated
// by Flex. See the CSS parser sources in WebKit for the original scanner.
// Detailed information on the CSS grammar can be found in the W3C CSS Syntax
// specification.
class Scanner {
// Creates a scanner over |input_iterator|, a zero-terminated UTF-8 string.
// Strings that have to be copied because they contain an escape sequence are
// allocated on the injected |string_pool|.
Scanner(const char* input_iterator, StringPool* string_pool);
// Recognizes a token under the input iterator.
// Returns a token, initializes its value (if any) and advances the input
// iterator accordingly. Guaranteed to succeed because unrecognized characters
// are returned as is (and are expected to trigger a parser error).
Token Scan(TokenValue* token_value, YYLTYPE* token_location);
// Token injection, used by the parser to choose an entry point.
void PrependToken(Token token);
// Overload of the Detect*() family (see below) that takes the name as
// a std::string. Presumably checks |property_name| against the known CSS
// property names; per the Detect*() contract it sets |*property_name_token|
// and returns true on a match, and otherwise returns false leaving the token
// intact.
bool DetectPropertyNameToken(const std::string& property_name,
Token* property_name_token) const;
// Parsing modes are the equivalent of Flex start conditions:
enum ParsingMode {
// A scanner is initialized in this mode and spends most of the time in it.
// A scanner enters this mode after seeing "@media" or "@import" and exits
// it after ";" or "{". The mode is needed to recognize "and", "not", "only"
// as keywords (which are treated as identifiers or function names in
// normal mode).
// A scanner enters this mode after seeing "@supports" and exits it after
// ";" or "{". The mode is needed to recognize "and", "not", "or" as
// keywords (which are treated as identifiers or function names in normal
// mode).
// A scanner enters this mode after seeing "nth-child" or similar function
// and exits it after ")". The mode is needed to recognize "an+b"
// microsyntax:
// ScanFrom*() methods are called by a main scanner loop when an input
// iterator is pointing at a character of a corresponding type. These methods
// are guaranteed to succeed. They always return a token, initialize its value
// (if any) and advance the input iterator accordingly.
Token ScanFromCaselessU(TokenValue* token_value);
Token ScanFromIdentifierStart(TokenValue* token_value);
Token ScanFromDot(TokenValue* token_value);
Token ScanFromNumber(TokenValue* token_value);
Token ScanFromDash(TokenValue* token_value);
Token ScanFromOtherCharacter();
Token ScanFromNull(); // Does not advance input_iterator_ beyond
// the end of input.
Token ScanFromWhitespace();
Token ScanFromEndMediaQueryOrSupports();
Token ScanFromEndNthChild();
Token ScanFromQuote(TokenValue* token_value);
Token ScanFromExclamationMark(TokenValue* token_value);
Token ScanFromHashmark(TokenValue* token_value);
Token ScanFromSlash();
Token ScanFromDollar();
Token ScanFromAsterisk();
Token ScanFromPlus(TokenValue* token_value);
Token ScanFromLess();
Token ScanFromAt(TokenValue* token_value);
Token ScanFromBackSlash(TokenValue* token_value);
Token ScanFromXor();
Token ScanFromVerticalBar();
Token ScanFromTilde();
// Scan*() methods are much like ScanFrom*() except they recognize only one
// type of tokens.
// TryScan*() methods also recognize only one type of tokens but are not
// guaranteed to succeed. If they fail to recognize a token, they return false
// and restore the position of the input iterator.
// Detect*() methods check whether a previously scanned identifier
// matches one of the CSS keywords. If they succeed, they initialize a token
// type and return true, otherwise they return false and leave the token type
// intact.
// Detect*AndMaybeChangeParsingMode() methods may change a parsing mode
// if they match an appropriate CSS keyword. The parsing mode may stay intact
// even if a method succeeds.
// TryScanAndMaybeCopy*() methods are intended to be called twice: first time
// to try scanning a token through a fast track that does not require copying,
// second time through a slower copying track if an escape sequence is found
// during the first pass.
bool TryScanUnicodeRange(TrivialIntPair* value);
void ScanIdentifier(TrivialStringPiece* value, bool* has_escape);
bool IsInputIteratorAtIdentifierStart() const;
template <bool copy>
bool TryScanAndMaybeCopyIdentifier(TrivialStringPiece* value,
std::string* value_copy);
UChar32 ScanEscape(); // Escape is not a token, it is a part of string.
bool DetectPropertyNameToken(const TrivialStringPiece& name,
Token* property_name_token) const;
bool DetectPropertyValueToken(const TrivialStringPiece& name,
Token* property_value_token) const;
bool DetectPseudoClassNameToken(const TrivialStringPiece& name,
Token* pseudo_class_name_token) const;
bool DetectPseudoElementNameToken(const TrivialStringPiece& name,
Token* pseudo_element_name_token) const;
bool DetectSupportsToken(const TrivialStringPiece& name,
Token* supports_token) const;
bool DetectKnownFunctionTokenAndMaybeChangeParsingMode(
const TrivialStringPiece& name, Token* known_function_token);
bool DetectMediaQueryToken(const TrivialStringPiece& name,
Token* media_query_token,
cssom::MediaFeatureName* media_feature_name) const;
bool TryScanUri(TrivialStringPiece* uri);
bool FindUri(const char** uri_start, const char** uri_end, char* quote) const;
template <bool copy>
bool TryScanAndMaybeCopyUri(char quote, TrivialStringPiece* uri,
std::string* uri_copy);
void ScanString(char quote, TrivialStringPiece* value);
template <bool copy>
bool TryScanAndMaybeCopyString(char quote, TrivialStringPiece* value,
std::string* value_copy);
bool TryScanNthChild(TrivialStringPiece* nth);
bool TryScanNthChildExtra(TrivialStringPiece* nth);
bool DetectUnitToken(const TrivialStringPiece& unit, Token* token) const;
bool DetectAtTokenAndMaybeChangeParsingMode(const TrivialStringPiece& name,
bool has_escape, Token* at_token);
bool DetectMediaFeatureNamePrefix(Token* token);
void ScanUnrecognizedAtRule();
void HandleBraceIfExists(char character);
// Current position in the zero-terminated input; advanced as tokens are
// recognized.
const char* input_iterator_;
// Injected pool on which copies of strings containing escape sequences are
// allocated.
StringPool* const string_pool_;
// Parsing mode the scanner is currently in (see ParsingMode).
ParsingMode parsing_mode_;
// NOTE(review): presumably the current line and the start of that line, used
// to report token locations - confirm against the implementation.
int line_number_;
const char* line_start_;
// NOTE(review): presumably holds tokens injected through PrependToken() until
// Scan() returns them - confirm against the implementation.
std::deque<Token> prepended_tokens_;
// Tracks the opening braces seen so far, so that any that are still
// unmatched at the end of input can be closed.
std::stack<char> open_braces_;
// By Bison's convention, the function that returns the next token should be
// named yylex(). This adapter forwards |token_value| and |token_location| to
// |scanner|->Scan() and returns the token that Scan() produces.
inline Token yylex(TokenValue* token_value, YYLTYPE* token_location,
Scanner* scanner) {
return scanner->Scan(token_value, token_location);
} // namespace css_parser
} // namespace cobalt