| from __future__ import absolute_import, division, unicode_literals |
| |
try:
    # Python 2 compatibility: alias chr to unichr so chr() yields full
    # Unicode characters.  On Python 3 unichr does not exist and chr()
    # already does the right thing, so the NameError is swallowed.
    chr = unichr # flake8: noqa
except NameError:
    pass
| |
| from collections import deque |
| |
| from .constants import spaceCharacters |
| from .constants import entities |
| from .constants import asciiLetters, asciiUpper2Lower |
| from .constants import digits, hexDigits, EOF |
| from .constants import tokenTypes, tagTokenTypes |
| from .constants import replacementCharacters |
| |
| from .inputstream import HTMLInputStream |
| |
| from .trie import Trie |
| |
# Prebuilt trie over all named character references; used for
# longest-prefix matching while consuming entities.
entitiesTrie = Trie(entities)
| |
| |
| class HTMLTokenizer(object): |
| """ This class takes care of tokenizing HTML. |
| |
| * self.currentToken |
| Holds the token that is currently being processed. |
| |
| * self.state |
    Holds a bound reference to the state method that will process the
    next character; state methods return False when tokenization ends.
| |
| * self.stream |
| Points to HTMLInputStream object. |
| """ |
| |
    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=True, lowercaseAttrName=True, parser=None):
        """Set up the tokenizer over *stream*.

        ``stream`` is anything HTMLInputStream accepts; ``encoding``,
        ``parseMeta`` and ``useChardet`` are forwarded to it.  The two
        ``lowercase*`` flags control case folding of element/attribute
        names as tokens are built; ``parser`` is an optional back-reference
        to the owning parser.
        """

        self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
        self.parser = parser

        # Perform case conversions?
        self.lowercaseElementName = lowercaseElementName
        self.lowercaseAttrName = lowercaseAttrName

        # Setup the initial tokenizer state
        self.escapeFlag = False
        self.lastFourChars = []
        self.state = self.dataState  # current state method; see __iter__
        self.escape = False

        # The current token being created (None until a tag/comment starts)
        self.currentToken = None
        super(HTMLTokenizer, self).__init__()
| |
| def __iter__(self): |
| """ This is where the magic happens. |
| |
| We do our usually processing through the states and when we have a token |
| to return we yield the token which pauses processing until the next token |
| is requested. |
| """ |
| self.tokenQueue = deque([]) |
| # Start processing. When EOF is reached self.state will return False |
| # instead of True and the loop will terminate. |
| while self.state(): |
| while self.stream.errors: |
| yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)} |
| while self.tokenQueue: |
| yield self.tokenQueue.popleft() |
| |
    def consumeNumberEntity(self, isHex):
        """Consume a numeric character reference and return its character.

        ``isHex`` selects hexadecimal ("&#x...") versus decimal ("&#...")
        digits.  Disallowed code points are remapped or replaced with
        U+FFFD, each with a ParseError queued; a missing trailing ";" also
        queues a ParseError and ungets the offending character.
        """

        allowed = digits
        radix = 10
        if isHex:
            allowed = hexDigits
            radix = 16

        charStack = []

        # Consume all the characters that are in range while making sure we
        # don't hit an EOF.
        c = self.stream.char()
        while c in allowed and c is not EOF:
            charStack.append(c)
            c = self.stream.char()

        # Convert the set of characters consumed to an int.
        charAsInt = int("".join(charStack), radix)

        # Certain characters get replaced with others (Windows-1252 remap
        # per the spec's numeric-reference table).
        if charAsInt in replacementCharacters:
            char = replacementCharacters[charAsInt]
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        elif ((0xD800 <= charAsInt <= 0xDFFF) or
              (charAsInt > 0x10FFFF)):
            # Surrogates and values beyond U+10FFFF become U+FFFD.
            char = "\uFFFD"
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        else:
            # Control characters and noncharacters are kept as-is but
            # flagged as a parse error.
            # Should speed up this check somehow (e.g. move the set to a constant)
            if ((0x0001 <= charAsInt <= 0x0008) or
                (0x000E <= charAsInt <= 0x001F) or
                (0x007F <= charAsInt <= 0x009F) or
                (0xFDD0 <= charAsInt <= 0xFDEF) or
                charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
                                        0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                        0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
                                        0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
                                        0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
                                        0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
                                        0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                        0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
                                        0xFFFFF, 0x10FFFE, 0x10FFFF])):
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data":
                                        "illegal-codepoint-for-numeric-entity",
                                        "datavars": {"charAsInt": charAsInt}})
            try:
                # Try/except needed as UCS-2 Python builds' unichar only works
                # within the BMP.
                char = chr(charAsInt)
            except ValueError:
                # Narrow build: encode the astral code point as a
                # surrogate pair by hand.
                v = charAsInt - 0x10000
                char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))

        # Discard the ; if present. Otherwise, put it back on the queue and
        # invoke parseError on parser.
        if c != ";":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "numeric-entity-without-semicolon"})
            self.stream.unget(c)

        return char
| |
    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        """Consume a character reference starting just after "&".

        ``allowedChar`` is an extra character that terminates the reference
        without error (used inside attribute values); ``fromAttribute``
        appends the result to the current attribute value instead of
        queueing a Characters/SpaceCharacters token.
        """
        # Initialise to the default output for when no entity is matched
        output = "&"

        charStack = [self.stream.char()]
        if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")
                or (allowedChar is not None and allowedChar == charStack[0])):
            # Not an entity at all; reprocess the character as ordinary data.
            self.stream.unget(charStack[0])

        elif charStack[0] == "#":
            # Read the next character to see if it's hex or decimal
            hex = False
            charStack.append(self.stream.char())
            if charStack[-1] in ("x", "X"):
                hex = True
                charStack.append(self.stream.char())

            # charStack[-1] should be the first digit
            if (hex and charStack[-1] in hexDigits) \
                    or (not hex and charStack[-1] in digits):
                # At least one digit found, so consume the whole number
                self.stream.unget(charStack[-1])
                output = self.consumeNumberEntity(hex)
            else:
                # No digits found
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "expected-numeric-entity"})
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)

        else:
            # At this point in the process might have named entity. Entities
            # are stored in the global variable "entities".
            #
            # Consume characters and compare to these to a substring of the
            # entity names in the list until the substring no longer matches.
            while (charStack[-1] is not EOF):
                if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
                    break
                charStack.append(self.stream.char())

            # At this point we have a string that starts with some characters
            # that may match an entity
            # Try to find the longest entity the string will match to take care
            # of &noti for instance.
            try:
                entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
                entityLength = len(entityName)
            except KeyError:
                entityName = None

            if entityName is not None:
                if entityName[-1] != ";":
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "named-entity-without-semicolon"})
                # In attributes, a semicolon-less entity followed by an
                # alphanumeric or "=" is treated as literal text instead.
                if (entityName[-1] != ";" and fromAttribute and
                    (charStack[entityLength] in asciiLetters or
                     charStack[entityLength] in digits or
                     charStack[entityLength] == "=")):
                    self.stream.unget(charStack.pop())
                    output = "&" + "".join(charStack)
                else:
                    output = entities[entityName]
                    self.stream.unget(charStack.pop())
                    # Any characters consumed past the entity are kept.
                    output += "".join(charStack[entityLength:])
            else:
                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                        "expected-named-entity"})
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)

        if fromAttribute:
            self.currentToken["data"][-1][1] += output
        else:
            if output in spaceCharacters:
                tokenType = "SpaceCharacters"
            else:
                tokenType = "Characters"
            self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
| |
| def processEntityInAttribute(self, allowedChar): |
| """This method replaces the need for "entityInAttributeValueState". |
| """ |
| self.consumeEntity(allowedChar=allowedChar, fromAttribute=True) |
| |
    def emitCurrentToken(self):
        """Queue ``self.currentToken`` for emission and reset to data state.

        For tag tokens this also lowercases the name (if configured) and,
        for end tags, reports parse errors for attributes or a self-closing
        flag before the token itself is queued.
        """
        token = self.currentToken
        # Add token to the queue to be yielded
        if (token["type"] in tagTokenTypes):
            if self.lowercaseElementName:
                token["name"] = token["name"].translate(asciiUpper2Lower)
            if token["type"] == tokenTypes["EndTag"]:
                # End tags may carry neither attributes nor "/>".
                if token["data"]:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                            "data": "attributes-in-end-tag"})
                if token["selfClosing"]:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                            "data": "self-closing-flag-on-end-tag"})
        self.tokenQueue.append(token)
        self.state = self.dataState
| |
| # Below are the various tokenizer states worked out. |
| def dataState(self): |
| data = self.stream.char() |
| if data == "&": |
| self.state = self.entityDataState |
| elif data == "<": |
| self.state = self.tagOpenState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.tokenQueue.append({"type": tokenTypes["Characters"], |
| "data": "\u0000"}) |
| elif data is EOF: |
| # Tokenization ends. |
| return False |
| elif data in spaceCharacters: |
| # Directly after emitting a token you switch back to the "data |
| # state". At that point spaceCharacters are important so they are |
| # emitted separately. |
| self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": |
| data + self.stream.charsUntil(spaceCharacters, True)}) |
| # No need to update lastFourChars here, since the first space will |
| # have already been appended to lastFourChars and will have broken |
| # any <!-- or --> sequences |
| else: |
| chars = self.stream.charsUntil(("&", "<", "\u0000")) |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": |
| data + chars}) |
| return True |
| |
    def entityDataState(self):
        """Handle "&" seen in the data state, then return to it."""
        self.consumeEntity()
        self.state = self.dataState
        return True
| |
| def rcdataState(self): |
| data = self.stream.char() |
| if data == "&": |
| self.state = self.characterReferenceInRcdata |
| elif data == "<": |
| self.state = self.rcdataLessThanSignState |
| elif data == EOF: |
| # Tokenization ends. |
| return False |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.tokenQueue.append({"type": tokenTypes["Characters"], |
| "data": "\uFFFD"}) |
| elif data in spaceCharacters: |
| # Directly after emitting a token you switch back to the "data |
| # state". At that point spaceCharacters are important so they are |
| # emitted separately. |
| self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": |
| data + self.stream.charsUntil(spaceCharacters, True)}) |
| # No need to update lastFourChars here, since the first space will |
| # have already been appended to lastFourChars and will have broken |
| # any <!-- or --> sequences |
| else: |
| chars = self.stream.charsUntil(("&", "<", "\u0000")) |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": |
| data + chars}) |
| return True |
| |
    def characterReferenceInRcdata(self):
        """Handle "&" seen in the RCDATA state, then return to it."""
        self.consumeEntity()
        self.state = self.rcdataState
        return True
| |
| def rawtextState(self): |
| data = self.stream.char() |
| if data == "<": |
| self.state = self.rawtextLessThanSignState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.tokenQueue.append({"type": tokenTypes["Characters"], |
| "data": "\uFFFD"}) |
| elif data == EOF: |
| # Tokenization ends. |
| return False |
| else: |
| chars = self.stream.charsUntil(("<", "\u0000")) |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": |
| data + chars}) |
| return True |
| |
| def scriptDataState(self): |
| data = self.stream.char() |
| if data == "<": |
| self.state = self.scriptDataLessThanSignState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.tokenQueue.append({"type": tokenTypes["Characters"], |
| "data": "\uFFFD"}) |
| elif data == EOF: |
| # Tokenization ends. |
| return False |
| else: |
| chars = self.stream.charsUntil(("<", "\u0000")) |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": |
| data + chars}) |
| return True |
| |
| def plaintextState(self): |
| data = self.stream.char() |
| if data == EOF: |
| # Tokenization ends. |
| return False |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.tokenQueue.append({"type": tokenTypes["Characters"], |
| "data": "\uFFFD"}) |
| else: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": |
| data + self.stream.charsUntil("\u0000")}) |
| return True |
| |
    def tagOpenState(self):
        """Dispatch on the character following "<" in the data state."""
        data = self.stream.char()
        if data == "!":
            self.state = self.markupDeclarationOpenState
        elif data == "/":
            self.state = self.closeTagOpenState
        elif data in asciiLetters:
            # Start tag; attributes accumulate as [name, value] pairs.
            self.currentToken = {"type": tokenTypes["StartTag"],
                                 "name": data, "data": [],
                                 "selfClosing": False,
                                 "selfClosingAcknowledged": False}
            self.state = self.tagNameState
        elif data == ">":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name-but-got-right-bracket"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
            self.state = self.dataState
        elif data == "?":
            # "<?" (e.g. an XML processing instruction) becomes a bogus comment.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name-but-got-question-mark"})
            self.stream.unget(data)
            self.state = self.bogusCommentState
        else:
            # Anything else: emit "<" literally and reprocess the character.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.dataState
        return True
| |
| def closeTagOpenState(self): |
| data = self.stream.char() |
| if data in asciiLetters: |
| self.currentToken = {"type": tokenTypes["EndTag"], "name": data, |
| "data": [], "selfClosing": False} |
| self.state = self.tagNameState |
| elif data == ">": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "expected-closing-tag-but-got-right-bracket"}) |
| self.state = self.dataState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "expected-closing-tag-but-got-eof"}) |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) |
| self.state = self.dataState |
| else: |
| # XXX data can be _'_... |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "expected-closing-tag-but-got-char", |
| "datavars": {"data": data}}) |
| self.stream.unget(data) |
| self.state = self.bogusCommentState |
| return True |
| |
| def tagNameState(self): |
| data = self.stream.char() |
| if data in spaceCharacters: |
| self.state = self.beforeAttributeNameState |
| elif data == ">": |
| self.emitCurrentToken() |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-tag-name"}) |
| self.state = self.dataState |
| elif data == "/": |
| self.state = self.selfClosingStartTagState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["name"] += "\uFFFD" |
| else: |
| self.currentToken["name"] += data |
| # (Don't use charsUntil here, because tag names are |
| # very short and it's faster to not do anything fancy) |
| return True |
| |
| def rcdataLessThanSignState(self): |
| data = self.stream.char() |
| if data == "/": |
| self.temporaryBuffer = "" |
| self.state = self.rcdataEndTagOpenState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) |
| self.stream.unget(data) |
| self.state = self.rcdataState |
| return True |
| |
| def rcdataEndTagOpenState(self): |
| data = self.stream.char() |
| if data in asciiLetters: |
| self.temporaryBuffer += data |
| self.state = self.rcdataEndTagNameState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) |
| self.stream.unget(data) |
| self.state = self.rcdataState |
| return True |
| |
| def rcdataEndTagNameState(self): |
| appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() |
| data = self.stream.char() |
| if data in spaceCharacters and appropriate: |
| self.currentToken = {"type": tokenTypes["EndTag"], |
| "name": self.temporaryBuffer, |
| "data": [], "selfClosing": False} |
| self.state = self.beforeAttributeNameState |
| elif data == "/" and appropriate: |
| self.currentToken = {"type": tokenTypes["EndTag"], |
| "name": self.temporaryBuffer, |
| "data": [], "selfClosing": False} |
| self.state = self.selfClosingStartTagState |
| elif data == ">" and appropriate: |
| self.currentToken = {"type": tokenTypes["EndTag"], |
| "name": self.temporaryBuffer, |
| "data": [], "selfClosing": False} |
| self.emitCurrentToken() |
| self.state = self.dataState |
| elif data in asciiLetters: |
| self.temporaryBuffer += data |
| else: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], |
| "data": "</" + self.temporaryBuffer}) |
| self.stream.unget(data) |
| self.state = self.rcdataState |
| return True |
| |
| def rawtextLessThanSignState(self): |
| data = self.stream.char() |
| if data == "/": |
| self.temporaryBuffer = "" |
| self.state = self.rawtextEndTagOpenState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) |
| self.stream.unget(data) |
| self.state = self.rawtextState |
| return True |
| |
| def rawtextEndTagOpenState(self): |
| data = self.stream.char() |
| if data in asciiLetters: |
| self.temporaryBuffer += data |
| self.state = self.rawtextEndTagNameState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) |
| self.stream.unget(data) |
| self.state = self.rawtextState |
| return True |
| |
| def rawtextEndTagNameState(self): |
| appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() |
| data = self.stream.char() |
| if data in spaceCharacters and appropriate: |
| self.currentToken = {"type": tokenTypes["EndTag"], |
| "name": self.temporaryBuffer, |
| "data": [], "selfClosing": False} |
| self.state = self.beforeAttributeNameState |
| elif data == "/" and appropriate: |
| self.currentToken = {"type": tokenTypes["EndTag"], |
| "name": self.temporaryBuffer, |
| "data": [], "selfClosing": False} |
| self.state = self.selfClosingStartTagState |
| elif data == ">" and appropriate: |
| self.currentToken = {"type": tokenTypes["EndTag"], |
| "name": self.temporaryBuffer, |
| "data": [], "selfClosing": False} |
| self.emitCurrentToken() |
| self.state = self.dataState |
| elif data in asciiLetters: |
| self.temporaryBuffer += data |
| else: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], |
| "data": "</" + self.temporaryBuffer}) |
| self.stream.unget(data) |
| self.state = self.rawtextState |
| return True |
| |
| def scriptDataLessThanSignState(self): |
| data = self.stream.char() |
| if data == "/": |
| self.temporaryBuffer = "" |
| self.state = self.scriptDataEndTagOpenState |
| elif data == "!": |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"}) |
| self.state = self.scriptDataEscapeStartState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) |
| self.stream.unget(data) |
| self.state = self.scriptDataState |
| return True |
| |
| def scriptDataEndTagOpenState(self): |
| data = self.stream.char() |
| if data in asciiLetters: |
| self.temporaryBuffer += data |
| self.state = self.scriptDataEndTagNameState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) |
| self.stream.unget(data) |
| self.state = self.scriptDataState |
| return True |
| |
| def scriptDataEndTagNameState(self): |
| appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() |
| data = self.stream.char() |
| if data in spaceCharacters and appropriate: |
| self.currentToken = {"type": tokenTypes["EndTag"], |
| "name": self.temporaryBuffer, |
| "data": [], "selfClosing": False} |
| self.state = self.beforeAttributeNameState |
| elif data == "/" and appropriate: |
| self.currentToken = {"type": tokenTypes["EndTag"], |
| "name": self.temporaryBuffer, |
| "data": [], "selfClosing": False} |
| self.state = self.selfClosingStartTagState |
| elif data == ">" and appropriate: |
| self.currentToken = {"type": tokenTypes["EndTag"], |
| "name": self.temporaryBuffer, |
| "data": [], "selfClosing": False} |
| self.emitCurrentToken() |
| self.state = self.dataState |
| elif data in asciiLetters: |
| self.temporaryBuffer += data |
| else: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], |
| "data": "</" + self.temporaryBuffer}) |
| self.stream.unget(data) |
| self.state = self.scriptDataState |
| return True |
| |
| def scriptDataEscapeStartState(self): |
| data = self.stream.char() |
| if data == "-": |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) |
| self.state = self.scriptDataEscapeStartDashState |
| else: |
| self.stream.unget(data) |
| self.state = self.scriptDataState |
| return True |
| |
| def scriptDataEscapeStartDashState(self): |
| data = self.stream.char() |
| if data == "-": |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) |
| self.state = self.scriptDataEscapedDashDashState |
| else: |
| self.stream.unget(data) |
| self.state = self.scriptDataState |
| return True |
| |
| def scriptDataEscapedState(self): |
| data = self.stream.char() |
| if data == "-": |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) |
| self.state = self.scriptDataEscapedDashState |
| elif data == "<": |
| self.state = self.scriptDataEscapedLessThanSignState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.tokenQueue.append({"type": tokenTypes["Characters"], |
| "data": "\uFFFD"}) |
| elif data == EOF: |
| self.state = self.dataState |
| else: |
| chars = self.stream.charsUntil(("<", "-", "\u0000")) |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": |
| data + chars}) |
| return True |
| |
| def scriptDataEscapedDashState(self): |
| data = self.stream.char() |
| if data == "-": |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) |
| self.state = self.scriptDataEscapedDashDashState |
| elif data == "<": |
| self.state = self.scriptDataEscapedLessThanSignState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.tokenQueue.append({"type": tokenTypes["Characters"], |
| "data": "\uFFFD"}) |
| self.state = self.scriptDataEscapedState |
| elif data == EOF: |
| self.state = self.dataState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) |
| self.state = self.scriptDataEscapedState |
| return True |
| |
| def scriptDataEscapedDashDashState(self): |
| data = self.stream.char() |
| if data == "-": |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) |
| elif data == "<": |
| self.state = self.scriptDataEscapedLessThanSignState |
| elif data == ">": |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) |
| self.state = self.scriptDataState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.tokenQueue.append({"type": tokenTypes["Characters"], |
| "data": "\uFFFD"}) |
| self.state = self.scriptDataEscapedState |
| elif data == EOF: |
| self.state = self.dataState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) |
| self.state = self.scriptDataEscapedState |
| return True |
| |
| def scriptDataEscapedLessThanSignState(self): |
| data = self.stream.char() |
| if data == "/": |
| self.temporaryBuffer = "" |
| self.state = self.scriptDataEscapedEndTagOpenState |
| elif data in asciiLetters: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data}) |
| self.temporaryBuffer = data |
| self.state = self.scriptDataDoubleEscapeStartState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) |
| self.stream.unget(data) |
| self.state = self.scriptDataEscapedState |
| return True |
| |
| def scriptDataEscapedEndTagOpenState(self): |
| data = self.stream.char() |
| if data in asciiLetters: |
| self.temporaryBuffer = data |
| self.state = self.scriptDataEscapedEndTagNameState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) |
| self.stream.unget(data) |
| self.state = self.scriptDataEscapedState |
| return True |
| |
| def scriptDataEscapedEndTagNameState(self): |
| appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() |
| data = self.stream.char() |
| if data in spaceCharacters and appropriate: |
| self.currentToken = {"type": tokenTypes["EndTag"], |
| "name": self.temporaryBuffer, |
| "data": [], "selfClosing": False} |
| self.state = self.beforeAttributeNameState |
| elif data == "/" and appropriate: |
| self.currentToken = {"type": tokenTypes["EndTag"], |
| "name": self.temporaryBuffer, |
| "data": [], "selfClosing": False} |
| self.state = self.selfClosingStartTagState |
| elif data == ">" and appropriate: |
| self.currentToken = {"type": tokenTypes["EndTag"], |
| "name": self.temporaryBuffer, |
| "data": [], "selfClosing": False} |
| self.emitCurrentToken() |
| self.state = self.dataState |
| elif data in asciiLetters: |
| self.temporaryBuffer += data |
| else: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], |
| "data": "</" + self.temporaryBuffer}) |
| self.stream.unget(data) |
| self.state = self.scriptDataEscapedState |
| return True |
| |
| def scriptDataDoubleEscapeStartState(self): |
| data = self.stream.char() |
| if data in (spaceCharacters | frozenset(("/", ">"))): |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) |
| if self.temporaryBuffer.lower() == "script": |
| self.state = self.scriptDataDoubleEscapedState |
| else: |
| self.state = self.scriptDataEscapedState |
| elif data in asciiLetters: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) |
| self.temporaryBuffer += data |
| else: |
| self.stream.unget(data) |
| self.state = self.scriptDataEscapedState |
| return True |
| |
| def scriptDataDoubleEscapedState(self): |
| data = self.stream.char() |
| if data == "-": |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) |
| self.state = self.scriptDataDoubleEscapedDashState |
| elif data == "<": |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) |
| self.state = self.scriptDataDoubleEscapedLessThanSignState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.tokenQueue.append({"type": tokenTypes["Characters"], |
| "data": "\uFFFD"}) |
| elif data == EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-script-in-script"}) |
| self.state = self.dataState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) |
| return True |
| |
| def scriptDataDoubleEscapedDashState(self): |
| data = self.stream.char() |
| if data == "-": |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) |
| self.state = self.scriptDataDoubleEscapedDashDashState |
| elif data == "<": |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) |
| self.state = self.scriptDataDoubleEscapedLessThanSignState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.tokenQueue.append({"type": tokenTypes["Characters"], |
| "data": "\uFFFD"}) |
| self.state = self.scriptDataDoubleEscapedState |
| elif data == EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-script-in-script"}) |
| self.state = self.dataState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) |
| self.state = self.scriptDataDoubleEscapedState |
| return True |
| |
| def scriptDataDoubleEscapedDashDashState(self): |
| data = self.stream.char() |
| if data == "-": |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) |
| elif data == "<": |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) |
| self.state = self.scriptDataDoubleEscapedLessThanSignState |
| elif data == ">": |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) |
| self.state = self.scriptDataState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.tokenQueue.append({"type": tokenTypes["Characters"], |
| "data": "\uFFFD"}) |
| self.state = self.scriptDataDoubleEscapedState |
| elif data == EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-script-in-script"}) |
| self.state = self.dataState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) |
| self.state = self.scriptDataDoubleEscapedState |
| return True |
| |
| def scriptDataDoubleEscapedLessThanSignState(self): |
| data = self.stream.char() |
| if data == "/": |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"}) |
| self.temporaryBuffer = "" |
| self.state = self.scriptDataDoubleEscapeEndState |
| else: |
| self.stream.unget(data) |
| self.state = self.scriptDataDoubleEscapedState |
| return True |
| |
| def scriptDataDoubleEscapeEndState(self): |
| data = self.stream.char() |
| if data in (spaceCharacters | frozenset(("/", ">"))): |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) |
| if self.temporaryBuffer.lower() == "script": |
| self.state = self.scriptDataEscapedState |
| else: |
| self.state = self.scriptDataDoubleEscapedState |
| elif data in asciiLetters: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) |
| self.temporaryBuffer += data |
| else: |
| self.stream.unget(data) |
| self.state = self.scriptDataDoubleEscapedState |
| return True |
| |
| def beforeAttributeNameState(self): |
| data = self.stream.char() |
| if data in spaceCharacters: |
| self.stream.charsUntil(spaceCharacters, True) |
| elif data in asciiLetters: |
| self.currentToken["data"].append([data, ""]) |
| self.state = self.attributeNameState |
| elif data == ">": |
| self.emitCurrentToken() |
| elif data == "/": |
| self.state = self.selfClosingStartTagState |
| elif data in ("'", '"', "=", "<"): |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "invalid-character-in-attribute-name"}) |
| self.currentToken["data"].append([data, ""]) |
| self.state = self.attributeNameState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["data"].append(["\uFFFD", ""]) |
| self.state = self.attributeNameState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "expected-attribute-name-but-got-eof"}) |
| self.state = self.dataState |
| else: |
| self.currentToken["data"].append([data, ""]) |
| self.state = self.attributeNameState |
| return True |
| |
    def attributeNameState(self):
        """Accumulate the current attribute's name (HTML5 'attribute name
        state').

        Instead of transitioning immediately, this state tracks two flags:
        ``leavingThisState`` marks that the name is now complete, so the
        case-folding and duplicate check at the bottom must run first, and
        ``emitToken`` defers tag emission until after that bookkeeping.
        """
        data = self.stream.char()
        leavingThisState = True
        emitToken = False
        if data == "=":
            self.state = self.beforeAttributeValueState
        elif data in asciiLetters:
            # Bulk-consume the rest of the ASCII-letter run in one call.
            self.currentToken["data"][-1][0] += data +\
                self.stream.charsUntil(asciiLetters, True)
            leavingThisState = False
        elif data == ">":
            # XXX If we emit here the attributes are converted to a dict
            # without being checked and when the code below runs we error
            # because data is a dict not a list
            emitToken = True
        elif data in spaceCharacters:
            self.state = self.afterAttributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            # U+0000 is a parse error; substitute the replacement character.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][0] += "\uFFFD"
            leavingThisState = False
        elif data in ("'", '"', "<"):
            # Disallowed in a name, but still appended after the error.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"][-1][0] += data
            leavingThisState = False
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-attribute-name"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][0] += data
            leavingThisState = False

        if leavingThisState:
            # Attributes are not dropped at this stage. That happens when the
            # start tag token is emitted so values can still be safely appended
            # to attributes, but we do want to report the parse error in time.
            if self.lowercaseAttrName:
                # Case-fold BEFORE the duplicate check so "A" duplicates "a".
                self.currentToken["data"][-1][0] = (
                    self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
            for name, value in self.currentToken["data"][:-1]:
                if self.currentToken["data"][-1][0] == name:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "duplicate-attribute"})
                    break
            # XXX Fix for above XXX
            if emitToken:
                self.emitCurrentToken()
        return True
| |
| def afterAttributeNameState(self): |
| data = self.stream.char() |
| if data in spaceCharacters: |
| self.stream.charsUntil(spaceCharacters, True) |
| elif data == "=": |
| self.state = self.beforeAttributeValueState |
| elif data == ">": |
| self.emitCurrentToken() |
| elif data in asciiLetters: |
| self.currentToken["data"].append([data, ""]) |
| self.state = self.attributeNameState |
| elif data == "/": |
| self.state = self.selfClosingStartTagState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["data"].append(["\uFFFD", ""]) |
| self.state = self.attributeNameState |
| elif data in ("'", '"', "<"): |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "invalid-character-after-attribute-name"}) |
| self.currentToken["data"].append([data, ""]) |
| self.state = self.attributeNameState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "expected-end-of-tag-but-got-eof"}) |
| self.state = self.dataState |
| else: |
| self.currentToken["data"].append([data, ""]) |
| self.state = self.attributeNameState |
| return True |
| |
| def beforeAttributeValueState(self): |
| data = self.stream.char() |
| if data in spaceCharacters: |
| self.stream.charsUntil(spaceCharacters, True) |
| elif data == "\"": |
| self.state = self.attributeValueDoubleQuotedState |
| elif data == "&": |
| self.state = self.attributeValueUnQuotedState |
| self.stream.unget(data) |
| elif data == "'": |
| self.state = self.attributeValueSingleQuotedState |
| elif data == ">": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "expected-attribute-value-but-got-right-bracket"}) |
| self.emitCurrentToken() |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["data"][-1][1] += "\uFFFD" |
| self.state = self.attributeValueUnQuotedState |
| elif data in ("=", "<", "`"): |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "equals-in-unquoted-attribute-value"}) |
| self.currentToken["data"][-1][1] += data |
| self.state = self.attributeValueUnQuotedState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "expected-attribute-value-but-got-eof"}) |
| self.state = self.dataState |
| else: |
| self.currentToken["data"][-1][1] += data |
| self.state = self.attributeValueUnQuotedState |
| return True |
| |
| def attributeValueDoubleQuotedState(self): |
| data = self.stream.char() |
| if data == "\"": |
| self.state = self.afterAttributeValueState |
| elif data == "&": |
| self.processEntityInAttribute('"') |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["data"][-1][1] += "\uFFFD" |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-attribute-value-double-quote"}) |
| self.state = self.dataState |
| else: |
| self.currentToken["data"][-1][1] += data +\ |
| self.stream.charsUntil(("\"", "&", "\u0000")) |
| return True |
| |
| def attributeValueSingleQuotedState(self): |
| data = self.stream.char() |
| if data == "'": |
| self.state = self.afterAttributeValueState |
| elif data == "&": |
| self.processEntityInAttribute("'") |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["data"][-1][1] += "\uFFFD" |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-attribute-value-single-quote"}) |
| self.state = self.dataState |
| else: |
| self.currentToken["data"][-1][1] += data +\ |
| self.stream.charsUntil(("'", "&", "\u0000")) |
| return True |
| |
| def attributeValueUnQuotedState(self): |
| data = self.stream.char() |
| if data in spaceCharacters: |
| self.state = self.beforeAttributeNameState |
| elif data == "&": |
| self.processEntityInAttribute(">") |
| elif data == ">": |
| self.emitCurrentToken() |
| elif data in ('"', "'", "=", "<", "`"): |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-character-in-unquoted-attribute-value"}) |
| self.currentToken["data"][-1][1] += data |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["data"][-1][1] += "\uFFFD" |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-attribute-value-no-quotes"}) |
| self.state = self.dataState |
| else: |
| self.currentToken["data"][-1][1] += data + self.stream.charsUntil( |
| frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters) |
| return True |
| |
| def afterAttributeValueState(self): |
| data = self.stream.char() |
| if data in spaceCharacters: |
| self.state = self.beforeAttributeNameState |
| elif data == ">": |
| self.emitCurrentToken() |
| elif data == "/": |
| self.state = self.selfClosingStartTagState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-EOF-after-attribute-value"}) |
| self.stream.unget(data) |
| self.state = self.dataState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-character-after-attribute-value"}) |
| self.stream.unget(data) |
| self.state = self.beforeAttributeNameState |
| return True |
| |
| def selfClosingStartTagState(self): |
| data = self.stream.char() |
| if data == ">": |
| self.currentToken["selfClosing"] = True |
| self.emitCurrentToken() |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": |
| "unexpected-EOF-after-solidus-in-tag"}) |
| self.stream.unget(data) |
| self.state = self.dataState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-character-after-solidus-in-tag"}) |
| self.stream.unget(data) |
| self.state = self.beforeAttributeNameState |
| return True |
| |
| def bogusCommentState(self): |
| # Make a new comment token and give it as value all the characters |
| # until the first > or EOF (charsUntil checks for EOF automatically) |
| # and emit it. |
| data = self.stream.charsUntil(">") |
| data = data.replace("\u0000", "\uFFFD") |
| self.tokenQueue.append( |
| {"type": tokenTypes["Comment"], "data": data}) |
| |
| # Eat the character directly after the bogus comment which is either a |
| # ">" or an EOF. |
| self.stream.char() |
| self.state = self.dataState |
| return True |
| |
    def markupDeclarationOpenState(self):
        """Distinguish comments, DOCTYPEs and CDATA sections after '<!'
        (HTML5 'markup declaration open state').

        Characters are read speculatively onto ``charStack``; if no
        construct matches, the whole stack is ungot (in reverse) and the
        input is reprocessed as a bogus comment.
        """
        charStack = [self.stream.char()]
        if charStack[-1] == "-":
            charStack.append(self.stream.char())
            if charStack[-1] == "-":
                # "<!--": start of a comment.
                self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
                self.state = self.commentStartState
                return True
        elif charStack[-1] in ('d', 'D'):
            # Case-insensitive match of the remaining "OCTYPE" letters.
            matched = True
            for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
                             ('y', 'Y'), ('p', 'P'), ('e', 'E')):
                charStack.append(self.stream.char())
                if charStack[-1] not in expected:
                    matched = False
                    break
            if matched:
                self.currentToken = {"type": tokenTypes["Doctype"],
                                     "name": "",
                                     "publicId": None, "systemId": None,
                                     "correct": True}
                self.state = self.doctypeState
                return True
        elif (charStack[-1] == "[" and
              self.parser is not None and
              self.parser.tree.openElements and
              self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
            # "<![CDATA[" is only recognized in foreign (non-HTML) content;
            # the match here is case-sensitive, unlike DOCTYPE.
            matched = True
            for expected in ["C", "D", "A", "T", "A", "["]:
                charStack.append(self.stream.char())
                if charStack[-1] != expected:
                    matched = False
                    break
            if matched:
                self.state = self.cdataSectionState
                return True

        # Nothing matched: report, push everything back (reverse order so
        # the stream re-reads it in the original order), and fall through
        # to the bogus comment state.
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-dashes-or-doctype"})

        while charStack:
            self.stream.unget(charStack.pop())
        self.state = self.bogusCommentState
        return True
| |
| def commentStartState(self): |
| data = self.stream.char() |
| if data == "-": |
| self.state = self.commentStartDashState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["data"] += "\uFFFD" |
| elif data == ">": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "incorrect-comment"}) |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-comment"}) |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.currentToken["data"] += data |
| self.state = self.commentState |
| return True |
| |
| def commentStartDashState(self): |
| data = self.stream.char() |
| if data == "-": |
| self.state = self.commentEndState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["data"] += "-\uFFFD" |
| elif data == ">": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "incorrect-comment"}) |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-comment"}) |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.currentToken["data"] += "-" + data |
| self.state = self.commentState |
| return True |
| |
| def commentState(self): |
| data = self.stream.char() |
| if data == "-": |
| self.state = self.commentEndDashState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["data"] += "\uFFFD" |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "eof-in-comment"}) |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.currentToken["data"] += data + \ |
| self.stream.charsUntil(("-", "\u0000")) |
| return True |
| |
| def commentEndDashState(self): |
| data = self.stream.char() |
| if data == "-": |
| self.state = self.commentEndState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["data"] += "-\uFFFD" |
| self.state = self.commentState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-comment-end-dash"}) |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.currentToken["data"] += "-" + data |
| self.state = self.commentState |
| return True |
| |
| def commentEndState(self): |
| data = self.stream.char() |
| if data == ">": |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["data"] += "--\uFFFD" |
| self.state = self.commentState |
| elif data == "!": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-bang-after-double-dash-in-comment"}) |
| self.state = self.commentEndBangState |
| elif data == "-": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-dash-after-double-dash-in-comment"}) |
| self.currentToken["data"] += data |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-comment-double-dash"}) |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| # XXX |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-char-in-comment"}) |
| self.currentToken["data"] += "--" + data |
| self.state = self.commentState |
| return True |
| |
| def commentEndBangState(self): |
| data = self.stream.char() |
| if data == ">": |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| elif data == "-": |
| self.currentToken["data"] += "--!" |
| self.state = self.commentEndDashState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["data"] += "--!\uFFFD" |
| self.state = self.commentState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-comment-end-bang-state"}) |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.currentToken["data"] += "--!" + data |
| self.state = self.commentState |
| return True |
| |
| def doctypeState(self): |
| data = self.stream.char() |
| if data in spaceCharacters: |
| self.state = self.beforeDoctypeNameState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "expected-doctype-name-but-got-eof"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "need-space-after-doctype"}) |
| self.stream.unget(data) |
| self.state = self.beforeDoctypeNameState |
| return True |
| |
| def beforeDoctypeNameState(self): |
| data = self.stream.char() |
| if data in spaceCharacters: |
| pass |
| elif data == ">": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "expected-doctype-name-but-got-right-bracket"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["name"] = "\uFFFD" |
| self.state = self.doctypeNameState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "expected-doctype-name-but-got-eof"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.currentToken["name"] = data |
| self.state = self.doctypeNameState |
| return True |
| |
| def doctypeNameState(self): |
| data = self.stream.char() |
| if data in spaceCharacters: |
| self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) |
| self.state = self.afterDoctypeNameState |
| elif data == ">": |
| self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["name"] += "\uFFFD" |
| self.state = self.doctypeNameState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-doctype-name"}) |
| self.currentToken["correct"] = False |
| self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.currentToken["name"] += data |
| return True |
| |
    def afterDoctypeNameState(self):
        """Look for the PUBLIC/SYSTEM keywords after a DOCTYPE name
        (HTML5 'after DOCTYPE name state').

        The keyword match is case-insensitive and speculative: ``data`` is
        re-read inside the matching loops, and on a failed match only the
        final character needs to be ungot (see comment below).
        """
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.currentToken["correct"] = False
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            if data in ("p", "P"):
                # Try to match the rest of "PUBLIC", case-insensitively.
                matched = True
                for expected in (("u", "U"), ("b", "B"), ("l", "L"),
                                 ("i", "I"), ("c", "C")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypePublicKeywordState
                    return True
            elif data in ("s", "S"):
                # Try to match the rest of "SYSTEM", case-insensitively.
                matched = True
                for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
                                 ("e", "E"), ("m", "M")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypeSystemKeywordState
                    return True

            # All the characters read before the current 'data' will be
            # [a-zA-Z], so they're garbage in the bogus doctype and can be
            # discarded; only the latest character might be '>' or EOF
            # and needs to be ungetted
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-space-or-right-bracket-in-doctype", "datavars":
                                    {"data": data}})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState

        return True
| |
| def afterDoctypePublicKeywordState(self): |
| data = self.stream.char() |
| if data in spaceCharacters: |
| self.state = self.beforeDoctypePublicIdentifierState |
| elif data in ("'", '"'): |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-char-in-doctype"}) |
| self.stream.unget(data) |
| self.state = self.beforeDoctypePublicIdentifierState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-doctype"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.stream.unget(data) |
| self.state = self.beforeDoctypePublicIdentifierState |
| return True |
| |
| def beforeDoctypePublicIdentifierState(self): |
| data = self.stream.char() |
| if data in spaceCharacters: |
| pass |
| elif data == "\"": |
| self.currentToken["publicId"] = "" |
| self.state = self.doctypePublicIdentifierDoubleQuotedState |
| elif data == "'": |
| self.currentToken["publicId"] = "" |
| self.state = self.doctypePublicIdentifierSingleQuotedState |
| elif data == ">": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-end-of-doctype"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-doctype"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-char-in-doctype"}) |
| self.currentToken["correct"] = False |
| self.state = self.bogusDoctypeState |
| return True |
| |
| def doctypePublicIdentifierDoubleQuotedState(self): |
| data = self.stream.char() |
| if data == "\"": |
| self.state = self.afterDoctypePublicIdentifierState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["publicId"] += "\uFFFD" |
| elif data == ">": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-end-of-doctype"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-doctype"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.currentToken["publicId"] += data |
| return True |
| |
| def doctypePublicIdentifierSingleQuotedState(self): |
| data = self.stream.char() |
| if data == "'": |
| self.state = self.afterDoctypePublicIdentifierState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["publicId"] += "\uFFFD" |
| elif data == ">": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-end-of-doctype"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-doctype"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.currentToken["publicId"] += data |
| return True |
| |
| def afterDoctypePublicIdentifierState(self): |
| data = self.stream.char() |
| if data in spaceCharacters: |
| self.state = self.betweenDoctypePublicAndSystemIdentifiersState |
| elif data == ">": |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| elif data == '"': |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-char-in-doctype"}) |
| self.currentToken["systemId"] = "" |
| self.state = self.doctypeSystemIdentifierDoubleQuotedState |
| elif data == "'": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-char-in-doctype"}) |
| self.currentToken["systemId"] = "" |
| self.state = self.doctypeSystemIdentifierSingleQuotedState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-doctype"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-char-in-doctype"}) |
| self.currentToken["correct"] = False |
| self.state = self.bogusDoctypeState |
| return True |
| |
| def betweenDoctypePublicAndSystemIdentifiersState(self): |
| data = self.stream.char() |
| if data in spaceCharacters: |
| pass |
| elif data == ">": |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| elif data == '"': |
| self.currentToken["systemId"] = "" |
| self.state = self.doctypeSystemIdentifierDoubleQuotedState |
| elif data == "'": |
| self.currentToken["systemId"] = "" |
| self.state = self.doctypeSystemIdentifierSingleQuotedState |
| elif data == EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-doctype"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-char-in-doctype"}) |
| self.currentToken["correct"] = False |
| self.state = self.bogusDoctypeState |
| return True |
| |
| def afterDoctypeSystemKeywordState(self): |
| data = self.stream.char() |
| if data in spaceCharacters: |
| self.state = self.beforeDoctypeSystemIdentifierState |
| elif data in ("'", '"'): |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-char-in-doctype"}) |
| self.stream.unget(data) |
| self.state = self.beforeDoctypeSystemIdentifierState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-doctype"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.stream.unget(data) |
| self.state = self.beforeDoctypeSystemIdentifierState |
| return True |
| |
| def beforeDoctypeSystemIdentifierState(self): |
| data = self.stream.char() |
| if data in spaceCharacters: |
| pass |
| elif data == "\"": |
| self.currentToken["systemId"] = "" |
| self.state = self.doctypeSystemIdentifierDoubleQuotedState |
| elif data == "'": |
| self.currentToken["systemId"] = "" |
| self.state = self.doctypeSystemIdentifierSingleQuotedState |
| elif data == ">": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-char-in-doctype"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-doctype"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-char-in-doctype"}) |
| self.currentToken["correct"] = False |
| self.state = self.bogusDoctypeState |
| return True |
| |
| def doctypeSystemIdentifierDoubleQuotedState(self): |
| data = self.stream.char() |
| if data == "\"": |
| self.state = self.afterDoctypeSystemIdentifierState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["systemId"] += "\uFFFD" |
| elif data == ">": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-end-of-doctype"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-doctype"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.currentToken["systemId"] += data |
| return True |
| |
| def doctypeSystemIdentifierSingleQuotedState(self): |
| data = self.stream.char() |
| if data == "'": |
| self.state = self.afterDoctypeSystemIdentifierState |
| elif data == "\u0000": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| self.currentToken["systemId"] += "\uFFFD" |
| elif data == ">": |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-end-of-doctype"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-doctype"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.currentToken["systemId"] += data |
| return True |
| |
| def afterDoctypeSystemIdentifierState(self): |
| data = self.stream.char() |
| if data in spaceCharacters: |
| pass |
| elif data == ">": |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| elif data is EOF: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "eof-in-doctype"}) |
| self.currentToken["correct"] = False |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": |
| "unexpected-char-in-doctype"}) |
| self.state = self.bogusDoctypeState |
| return True |
| |
| def bogusDoctypeState(self): |
| data = self.stream.char() |
| if data == ">": |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| elif data is EOF: |
| # XXX EMIT |
| self.stream.unget(data) |
| self.tokenQueue.append(self.currentToken) |
| self.state = self.dataState |
| else: |
| pass |
| return True |
| |
| def cdataSectionState(self): |
| data = [] |
| while True: |
| data.append(self.stream.charsUntil("]")) |
| data.append(self.stream.charsUntil(">")) |
| char = self.stream.char() |
| if char == EOF: |
| break |
| else: |
| assert char == ">" |
| if data[-1][-2:] == "]]": |
| data[-1] = data[-1][:-2] |
| break |
| else: |
| data.append(char) |
| |
| data = "".join(data) |
| # Deal with null here rather than in the parser |
| nullCount = data.count("\u0000") |
| if nullCount > 0: |
| for i in range(nullCount): |
| self.tokenQueue.append({"type": tokenTypes["ParseError"], |
| "data": "invalid-codepoint"}) |
| data = data.replace("\u0000", "\uFFFD") |
| if data: |
| self.tokenQueue.append({"type": tokenTypes["Characters"], |
| "data": data}) |
| self.state = self.dataState |
| return True |