/*
 * Copyright 2014 Google Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef COBALT_CSS_PARSER_SCANNER_H_
#define COBALT_CSS_PARSER_SCANNER_H_

#include <deque>
#include <stack>
#include <string>

#include "base/basictypes.h"
#include "cobalt/css_parser/grammar.h"
#include "cobalt/cssom/media_feature_names.h"
#include "third_party/icu/public/common/unicode/umachine.h"

namespace cobalt {
namespace css_parser {

// Forward declaration: this header only refers to the pool through a pointer.
class StringPool;
// Token identifiers are represented as plain integers.
typedef int Token;
// Forward declaration: this header only refers to string pieces through
// pointers and references in the scanner's method signatures.
struct TrivialStringPiece;

// Initialized with a zero-terminated input string in UTF-8 encoding,
// a scanner produces a series of CSS tokens, ending with kEndOfFileToken.
// The scanner is designed to be resilient to errors in the input and is
// supposed to reach the end of the input in any case.
//
// The scanner tries to minimize the amount of string copying, with most
// returned strings referencing the input. If a string copy is needed because
// a string contains an escape sequence, the copy will be allocated on an
// injected string pool.
//
// This implementation is an adaptation of the hand-written CSS scanner from
// WebKit that has shown a 2x performance improvement over the one generated
// by Flex. See https://bugs.webkit.org/show_bug.cgi?id=70107.
//
// Detailed information on the CSS grammar can be found at
// https://www.w3.org/TR/css-syntax-3/.
class Scanner {
 public:
  // Creates a scanner that reads from |input_iterator| and uses |string_pool|
  // for any string copies it has to make.
  // NOTE(review): both pointers look non-owning and are presumably required to
  // outlive the scanner -- confirm against the implementation.
  Scanner(const char* input_iterator, StringPool* string_pool);

  // Recognizes a token under the input iterator.
  // Returns a token, initializes its value (if any) and advances the input
  // iterator accordingly. Guaranteed to succeed because unrecognized characters
  // are returned as is (and are expected to trigger a parser error).
  Token Scan(TokenValue* token_value, YYLTYPE* token_location);

  // Token injection, used by the parser to choose an entry point.
  void PrependToken(Token token);

  // Public overload of the property name detection that takes a std::string
  // instead of a TrivialStringPiece. It follows the Detect*() contract
  // described in the private section: returns true and sets
  // |property_name_token| on a match, otherwise returns false and leaves the
  // token intact.
  bool DetectPropertyNameToken(const std::string& property_name,
                               Token* property_name_token) const;

 private:
  // Parsing modes are the equivalent of Flex start conditions:
  // http://flex.sourceforge.net/manual/Start-Conditions.html
  enum ParsingMode {
    // A scanner is initialized in this mode and spends most of the time in it.
    kNormalMode,

    // A scanner enters this mode after seeing "@media" or "@import" and exits
    // it after ";" or "{". The mode is needed to recognize "and", "not", "only"
    // as keywords (which are treated as identifiers or function names in
    // normal mode).
    kMediaQueryMode,

    // A scanner enters this mode after seeing "@supports" and exits it after
    // ";" or "{". The mode is needed to recognize "and", "not", "or" as
    // keywords (which are treated as identifiers or function names in normal
    // mode).
    kSupportsMode,

    // A scanner enters this mode after seeing "nth-child" or similar function
    // and exits it after ")". The mode is needed to recognize "an+b"
    // microsyntax: https://www.w3.org/TR/css-syntax-3/#anb
    kNthChildMode
  };

  // ScanFrom*() methods are called by a main scanner loop when an input
  // iterator is pointing at a character of a corresponding type. These methods
  // are guaranteed to succeed. They always return a token, initialize its value
  // (if any) and advance the input iterator accordingly.
  Token ScanFromCaselessU(TokenValue* token_value);
  Token ScanFromIdentifierStart(TokenValue* token_value);
  Token ScanFromDot(TokenValue* token_value);
  Token ScanFromNumber(TokenValue* token_value);
  Token ScanFromDash(TokenValue* token_value);
  Token ScanFromOtherCharacter();
  Token ScanFromNull();  // Does not advance input_iterator_ beyond
                         // the end of input.
  Token ScanFromWhitespace();
  Token ScanFromEndMediaQueryOrSupports();
  Token ScanFromEndNthChild();
  Token ScanFromQuote(TokenValue* token_value);
  Token ScanFromExclamationMark(TokenValue* token_value);
  Token ScanFromHashmark(TokenValue* token_value);
  Token ScanFromSlash();
  Token ScanFromDollar();
  Token ScanFromAsterisk();
  Token ScanFromPlus(TokenValue* token_value);
  Token ScanFromLess();
  Token ScanFromAt(TokenValue* token_value);
  Token ScanFromBackSlash(TokenValue* token_value);
  Token ScanFromXor();
  Token ScanFromVerticalBar();
  Token ScanFromTilde();

  // Scan*() methods are much like ScanFrom*() except they recognize only one
  // type of token.
  //
  // TryScan*() methods also recognize only one type of token but are not
  // guaranteed to succeed. If they fail to recognize a token, they return false
  // and restore the position of the input iterator.
  //
  // Detect*() methods check whether a previously scanned identifier
  // matches one of the CSS keywords. If they succeed, they initialize a token
  // type and return true, otherwise they return false and leave the token type
  // intact.
  //
  // Detect*AndMaybeChangeParsingMode() methods may change a parsing mode
  // if they match an appropriate CSS keyword. The parsing mode may stay intact
  // even if a method succeeds.
  //
  // TryScanAndMaybeCopy*() methods are intended to be called twice: first time
  // to try scanning a token through a fast track that does not require copying,
  // second time through a slower copying track if an escape sequence is found
  // during the first pass.
  bool TryScanUnicodeRange(TrivialIntPair* value);
  void ScanIdentifier(TrivialStringPiece* value, bool* has_escape);
  bool IsInputIteratorAtIdentifierStart() const;
  template <bool copy>
  bool TryScanAndMaybeCopyIdentifier(TrivialStringPiece* value,
                                     std::string* value_copy);
  UChar32 ScanEscape();  // Escape is not a token, it is a part of string.
  bool DetectPropertyNameToken(const TrivialStringPiece& name,
                               Token* property_name_token) const;
  bool DetectPropertyValueToken(const TrivialStringPiece& name,
                                Token* property_value_token) const;
  bool DetectPseudoClassNameToken(const TrivialStringPiece& name,
                                  Token* pseudo_class_name_token) const;
  bool DetectPseudoElementNameToken(const TrivialStringPiece& name,
                                    Token* pseudo_element_name_token) const;
  bool DetectSupportsToken(const TrivialStringPiece& name,
                           Token* supports_token) const;
  bool DetectKnownFunctionTokenAndMaybeChangeParsingMode(
      const TrivialStringPiece& name, Token* known_function_token);
  bool DetectMediaQueryToken(const TrivialStringPiece& name,
                             Token* media_query_token,
                             cssom::MediaFeatureName* media_feature_name) const;
  bool TryScanUri(TrivialStringPiece* uri);
  bool FindUri(const char** uri_start, const char** uri_end, char* quote) const;
  template <bool copy>
  bool TryScanAndMaybeCopyUri(char quote, TrivialStringPiece* uri,
                              std::string* uri_copy);
  void ScanString(char quote, TrivialStringPiece* value);
  template <bool copy>
  bool TryScanAndMaybeCopyString(char quote, TrivialStringPiece* value,
                                 std::string* value_copy);
  bool TryScanNthChild(TrivialStringPiece* nth);
  bool TryScanNthChildExtra(TrivialStringPiece* nth);
  bool DetectUnitToken(const TrivialStringPiece& unit, Token* token) const;
  bool DetectAtTokenAndMaybeChangeParsingMode(const TrivialStringPiece& name,
                                              bool has_escape, Token* at_token);

  // NOTE(review): unlike the other Detect*() methods, this one takes no
  // identifier and is not const, so it presumably inspects (and possibly
  // advances) the input iterator itself -- confirm against the implementation.
  bool DetectMediaFeatureNamePrefix(Token* token);
  void ScanUnrecognizedAtRule();
  void HandleBraceIfExists(char character);

  // Current position in the input; advanced by the scanning methods as tokens
  // are recognized.
  const char* input_iterator_;
  // Injected pool on which copies of strings containing escape sequences are
  // allocated. The pointer itself never changes after construction.
  StringPool* const string_pool_;

  // Parsing mode the scanner is currently in; see ParsingMode.
  ParsingMode parsing_mode_;

  // NOTE(review): presumably the current line number and a pointer to the
  // first character of that line, used to fill in token locations -- confirm.
  int line_number_;
  const char* line_start_;

  // Tokens injected through PrependToken().
  // NOTE(review): presumably handed out by Scan() before any token read from
  // the input -- confirm.
  std::deque<Token> prepended_tokens_;

  // Tracks the braces that have been opened so that any left unmatched can be
  // closed at the end of input.
  std::stack<char> open_braces_;

  DISALLOW_COPY_AND_ASSIGN(Scanner);
};

// Lexer entry point for the Bison-generated parser, which by convention
// obtains each next token through a function named yylex(). Simply delegates
// to |scanner|, forwarding |token_value| and |token_location| unchanged.
inline Token yylex(TokenValue* token_value, YYLTYPE* token_location,
                   Scanner* scanner) {
  const Token next_token = scanner->Scan(token_value, token_location);
  return next_token;
}

}  // namespace css_parser
}  // namespace cobalt

#endif  // COBALT_CSS_PARSER_SCANNER_H_
