From 68df54d6629ec019142eb149dd037774f2d11e7c Mon Sep 17 00:00:00 2001 From: Shubham Saini Date: Tue, 11 Dec 2018 15:31:23 +0530 Subject: First commit --- .../pip/_vendor/html5lib/_tokenizer.py | 1721 ++++++++++++++++++++ 1 file changed, 1721 insertions(+) create mode 100644 venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py (limited to 'venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py') diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py new file mode 100644 index 0000000..ef1ccf8 --- /dev/null +++ b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py @@ -0,0 +1,1721 @@ +from __future__ import absolute_import, division, unicode_literals + +from pip._vendor.six import unichr as chr + +from collections import deque + +from .constants import spaceCharacters +from .constants import entities +from .constants import asciiLetters, asciiUpper2Lower +from .constants import digits, hexDigits, EOF +from .constants import tokenTypes, tagTokenTypes +from .constants import replacementCharacters + +from ._inputstream import HTMLInputStream + +from ._trie import Trie + +entitiesTrie = Trie(entities) + + +class HTMLTokenizer(object): + """ This class takes care of tokenizing HTML. + + * self.currentToken + Holds the token that is currently being processed. + + * self.state + Holds a reference to the method to be invoked... XXX + + * self.stream + Points to HTMLInputStream object. + """ + + def __init__(self, stream, parser=None, **kwargs): + + self.stream = HTMLInputStream(stream, **kwargs) + self.parser = parser + + # Setup the initial tokenizer state + self.escapeFlag = False + self.lastFourChars = [] + self.state = self.dataState + self.escape = False + + # The current token being created + self.currentToken = None + super(HTMLTokenizer, self).__init__() + + def __iter__(self): + """ This is where the magic happens. + + We do our usually processing through the states and when we have a token + to return we yield the token which pauses processing until the next token + is requested. + """ + self.tokenQueue = deque([]) + # Start processing. When EOF is reached self.state will return False + # instead of True and the loop will terminate. + while self.state(): + while self.stream.errors: + yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)} + while self.tokenQueue: + yield self.tokenQueue.popleft() + + def consumeNumberEntity(self, isHex): + """This function returns either U+FFFD or the character based on the + decimal or hexadecimal representation. It also discards ";" if present. + If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked. + """ + + allowed = digits + radix = 10 + if isHex: + allowed = hexDigits + radix = 16 + + charStack = [] + + # Consume all the characters that are in range while making sure we + # don't hit an EOF. + c = self.stream.char() + while c in allowed and c is not EOF: + charStack.append(c) + c = self.stream.char() + + # Convert the set of characters consumed to an int. + charAsInt = int("".join(charStack), radix) + + # Certain characters get replaced with others + if charAsInt in replacementCharacters: + char = replacementCharacters[charAsInt] + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "illegal-codepoint-for-numeric-entity", + "datavars": {"charAsInt": charAsInt}}) + elif ((0xD800 <= charAsInt <= 0xDFFF) or + (charAsInt > 0x10FFFF)): + char = "\uFFFD" + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "illegal-codepoint-for-numeric-entity", + "datavars": {"charAsInt": charAsInt}}) + else: + # Should speed up this check somehow (e.g. move the set to a constant) + if ((0x0001 <= charAsInt <= 0x0008) or + (0x000E <= charAsInt <= 0x001F) or + (0x007F <= charAsInt <= 0x009F) or + (0xFDD0 <= charAsInt <= 0xFDEF) or + charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE, + 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, + 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, + 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, + 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, + 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, + 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, + 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, + 0xFFFFF, 0x10FFFE, 0x10FFFF])): + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": + "illegal-codepoint-for-numeric-entity", + "datavars": {"charAsInt": charAsInt}}) + try: + # Try/except needed as UCS-2 Python builds' unichar only works + # within the BMP. + char = chr(charAsInt) + except ValueError: + v = charAsInt - 0x10000 + char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF)) + + # Discard the ; if present. Otherwise, put it back on the queue and + # invoke parseError on parser. + if c != ";": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "numeric-entity-without-semicolon"}) + self.stream.unget(c) + + return char + + def consumeEntity(self, allowedChar=None, fromAttribute=False): + # Initialise to the default output for when no entity is matched + output = "&" + + charStack = [self.stream.char()] + if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or + (allowedChar is not None and allowedChar == charStack[0])): + self.stream.unget(charStack[0]) + + elif charStack[0] == "#": + # Read the next character to see if it's hex or decimal + hex = False + charStack.append(self.stream.char()) + if charStack[-1] in ("x", "X"): + hex = True + charStack.append(self.stream.char()) + + # charStack[-1] should be the first digit + if (hex and charStack[-1] in hexDigits) \ + or (not hex and charStack[-1] in digits): + # At least one digit found, so consume the whole number + self.stream.unget(charStack[-1]) + output = self.consumeNumberEntity(hex) + else: + # No digits found + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "expected-numeric-entity"}) + self.stream.unget(charStack.pop()) + output = "&" + "".join(charStack) + + else: + # At this point in the process might have named entity. Entities + # are stored in the global variable "entities". + # + # Consume characters and compare to these to a substring of the + # entity names in the list until the substring no longer matches. + while (charStack[-1] is not EOF): + if not entitiesTrie.has_keys_with_prefix("".join(charStack)): + break + charStack.append(self.stream.char()) + + # At this point we have a string that starts with some characters + # that may match an entity + # Try to find the longest entity the string will match to take care + # of ¬i for instance. + try: + entityName = entitiesTrie.longest_prefix("".join(charStack[:-1])) + entityLength = len(entityName) + except KeyError: + entityName = None + + if entityName is not None: + if entityName[-1] != ";": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "named-entity-without-semicolon"}) + if (entityName[-1] != ";" and fromAttribute and + (charStack[entityLength] in asciiLetters or + charStack[entityLength] in digits or + charStack[entityLength] == "=")): + self.stream.unget(charStack.pop()) + output = "&" + "".join(charStack) + else: + output = entities[entityName] + self.stream.unget(charStack.pop()) + output += "".join(charStack[entityLength:]) + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-named-entity"}) + self.stream.unget(charStack.pop()) + output = "&" + "".join(charStack) + + if fromAttribute: + self.currentToken["data"][-1][1] += output + else: + if output in spaceCharacters: + tokenType = "SpaceCharacters" + else: + tokenType = "Characters" + self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output}) + + def processEntityInAttribute(self, allowedChar): + """This method replaces the need for "entityInAttributeValueState". + """ + self.consumeEntity(allowedChar=allowedChar, fromAttribute=True) + + def emitCurrentToken(self): + """This method is a generic handler for emitting the tags. It also sets + the state to "data" because that's what's needed after a token has been + emitted. + """ + token = self.currentToken + # Add token to the queue to be yielded + if (token["type"] in tagTokenTypes): + token["name"] = token["name"].translate(asciiUpper2Lower) + if token["type"] == tokenTypes["EndTag"]: + if token["data"]: + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "attributes-in-end-tag"}) + if token["selfClosing"]: + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "self-closing-flag-on-end-tag"}) + self.tokenQueue.append(token) + self.state = self.dataState + + # Below are the various tokenizer states worked out. + def dataState(self): + data = self.stream.char() + if data == "&": + self.state = self.entityDataState + elif data == "<": + self.state = self.tagOpenState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": "\u0000"}) + elif data is EOF: + # Tokenization ends. + return False + elif data in spaceCharacters: + # Directly after emitting a token you switch back to the "data + # state". At that point spaceCharacters are important so they are + # emitted separately. + self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": + data + self.stream.charsUntil(spaceCharacters, True)}) + # No need to update lastFourChars here, since the first space will + # have already been appended to lastFourChars and will have broken + # any sequences + else: + chars = self.stream.charsUntil(("&", "<", "\u0000")) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + chars}) + return True + + def entityDataState(self): + self.consumeEntity() + self.state = self.dataState + return True + + def rcdataState(self): + data = self.stream.char() + if data == "&": + self.state = self.characterReferenceInRcdata + elif data == "<": + self.state = self.rcdataLessThanSignState + elif data == EOF: + # Tokenization ends. + return False + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": "\uFFFD"}) + elif data in spaceCharacters: + # Directly after emitting a token you switch back to the "data + # state". At that point spaceCharacters are important so they are + # emitted separately. + self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": + data + self.stream.charsUntil(spaceCharacters, True)}) + # No need to update lastFourChars here, since the first space will + # have already been appended to lastFourChars and will have broken + # any sequences + else: + chars = self.stream.charsUntil(("&", "<", "\u0000")) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + chars}) + return True + + def characterReferenceInRcdata(self): + self.consumeEntity() + self.state = self.rcdataState + return True + + def rawtextState(self): + data = self.stream.char() + if data == "<": + self.state = self.rawtextLessThanSignState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": "\uFFFD"}) + elif data == EOF: + # Tokenization ends. + return False + else: + chars = self.stream.charsUntil(("<", "\u0000")) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + chars}) + return True + + def scriptDataState(self): + data = self.stream.char() + if data == "<": + self.state = self.scriptDataLessThanSignState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": "\uFFFD"}) + elif data == EOF: + # Tokenization ends. + return False + else: + chars = self.stream.charsUntil(("<", "\u0000")) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + chars}) + return True + + def plaintextState(self): + data = self.stream.char() + if data == EOF: + # Tokenization ends. + return False + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": "\uFFFD"}) + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + self.stream.charsUntil("\u0000")}) + return True + + def tagOpenState(self): + data = self.stream.char() + if data == "!": + self.state = self.markupDeclarationOpenState + elif data == "/": + self.state = self.closeTagOpenState + elif data in asciiLetters: + self.currentToken = {"type": tokenTypes["StartTag"], + "name": data, "data": [], + "selfClosing": False, + "selfClosingAcknowledged": False} + self.state = self.tagNameState + elif data == ">": + # XXX In theory it could be something besides a tag name. But + # do we really care? + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-tag-name-but-got-right-bracket"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"}) + self.state = self.dataState + elif data == "?": + # XXX In theory it could be something besides a tag name. But + # do we really care? + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-tag-name-but-got-question-mark"}) + self.stream.unget(data) + self.state = self.bogusCommentState + else: + # XXX + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-tag-name"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) + self.stream.unget(data) + self.state = self.dataState + return True + + def closeTagOpenState(self): + data = self.stream.char() + if data in asciiLetters: + self.currentToken = {"type": tokenTypes["EndTag"], "name": data, + "data": [], "selfClosing": False} + self.state = self.tagNameState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-closing-tag-but-got-right-bracket"}) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-closing-tag-but-got-eof"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "": + self.emitCurrentToken() + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-tag-name"}) + self.state = self.dataState + elif data == "/": + self.state = self.selfClosingStartTagState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["name"] += "\uFFFD" + else: + self.currentToken["name"] += data + # (Don't use charsUntil here, because tag names are + # very short and it's faster to not do anything fancy) + return True + + def rcdataLessThanSignState(self): + data = self.stream.char() + if data == "/": + self.temporaryBuffer = "" + self.state = self.rcdataEndTagOpenState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) + self.stream.unget(data) + self.state = self.rcdataState + return True + + def rcdataEndTagOpenState(self): + data = self.stream.char() + if data in asciiLetters: + self.temporaryBuffer += data + self.state = self.rcdataEndTagNameState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "" and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing": False} + self.emitCurrentToken() + self.state = self.dataState + elif data in asciiLetters: + self.temporaryBuffer += data + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": "" and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing": False} + self.emitCurrentToken() + self.state = self.dataState + elif data in asciiLetters: + self.temporaryBuffer += data + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": "" and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing": False} + self.emitCurrentToken() + self.state = self.dataState + elif data in asciiLetters: + self.temporaryBuffer += data + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": "": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) + self.state = self.scriptDataState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": "\uFFFD"}) + self.state = self.scriptDataEscapedState + elif data == EOF: + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.state = self.scriptDataEscapedState + return True + + def scriptDataEscapedLessThanSignState(self): + data = self.stream.char() + if data == "/": + self.temporaryBuffer = "" + self.state = self.scriptDataEscapedEndTagOpenState + elif data in asciiLetters: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data}) + self.temporaryBuffer = data + self.state = self.scriptDataDoubleEscapeStartState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) + self.stream.unget(data) + self.state = self.scriptDataEscapedState + return True + + def scriptDataEscapedEndTagOpenState(self): + data = self.stream.char() + if data in asciiLetters: + self.temporaryBuffer = data + self.state = self.scriptDataEscapedEndTagNameState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "" and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing": False} + self.emitCurrentToken() + self.state = self.dataState + elif data in asciiLetters: + self.temporaryBuffer += data + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": ""))): + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + if self.temporaryBuffer.lower() == "script": + self.state = self.scriptDataDoubleEscapedState + else: + self.state = self.scriptDataEscapedState + elif data in asciiLetters: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.temporaryBuffer += data + else: + self.stream.unget(data) + self.state = self.scriptDataEscapedState + return True + + def scriptDataDoubleEscapedState(self): + data = self.stream.char() + if data == "-": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) + self.state = self.scriptDataDoubleEscapedDashState + elif data == "<": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) + self.state = self.scriptDataDoubleEscapedLessThanSignState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": "\uFFFD"}) + elif data == EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-script-in-script"}) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + return True + + def scriptDataDoubleEscapedDashState(self): + data = self.stream.char() + if data == "-": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) + self.state = self.scriptDataDoubleEscapedDashDashState + elif data == "<": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) + self.state = self.scriptDataDoubleEscapedLessThanSignState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": "\uFFFD"}) + self.state = self.scriptDataDoubleEscapedState + elif data == EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-script-in-script"}) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.state = self.scriptDataDoubleEscapedState + return True + + def scriptDataDoubleEscapedDashDashState(self): + data = self.stream.char() + if data == "-": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) + elif data == "<": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) + self.state = self.scriptDataDoubleEscapedLessThanSignState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) + self.state = self.scriptDataState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": "\uFFFD"}) + self.state = self.scriptDataDoubleEscapedState + elif data == EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-script-in-script"}) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.state = self.scriptDataDoubleEscapedState + return True + + def scriptDataDoubleEscapedLessThanSignState(self): + data = self.stream.char() + if data == "/": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"}) + self.temporaryBuffer = "" + self.state = self.scriptDataDoubleEscapeEndState + else: + self.stream.unget(data) + self.state = self.scriptDataDoubleEscapedState + return True + + def scriptDataDoubleEscapeEndState(self): + data = self.stream.char() + if data in (spaceCharacters | frozenset(("/", ">"))): + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + if self.temporaryBuffer.lower() == "script": + self.state = self.scriptDataEscapedState + else: + self.state = self.scriptDataDoubleEscapedState + elif data in asciiLetters: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.temporaryBuffer += data + else: + self.stream.unget(data) + self.state = self.scriptDataDoubleEscapedState + return True + + def beforeAttributeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + self.stream.charsUntil(spaceCharacters, True) + elif data in asciiLetters: + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + elif data == ">": + self.emitCurrentToken() + elif data == "/": + self.state = self.selfClosingStartTagState + elif data in ("'", '"', "=", "<"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "invalid-character-in-attribute-name"}) + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"].append(["\uFFFD", ""]) + self.state = self.attributeNameState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-attribute-name-but-got-eof"}) + self.state = self.dataState + else: + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + return True + + def attributeNameState(self): + data = self.stream.char() + leavingThisState = True + emitToken = False + if data == "=": + self.state = self.beforeAttributeValueState + elif data in asciiLetters: + self.currentToken["data"][-1][0] += data +\ + self.stream.charsUntil(asciiLetters, True) + leavingThisState = False + elif data == ">": + # XXX If we emit here the attributes are converted to a dict + # without being checked and when the code below runs we error + # because data is a dict not a list + emitToken = True + elif data in spaceCharacters: + self.state = self.afterAttributeNameState + elif data == "/": + self.state = self.selfClosingStartTagState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"][-1][0] += "\uFFFD" + leavingThisState = False + elif data in ("'", '"', "<"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": + "invalid-character-in-attribute-name"}) + self.currentToken["data"][-1][0] += data + leavingThisState = False + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "eof-in-attribute-name"}) + self.state = self.dataState + else: + self.currentToken["data"][-1][0] += data + leavingThisState = False + + if leavingThisState: + # Attributes are not dropped at this stage. That happens when the + # start tag token is emitted so values can still be safely appended + # to attributes, but we do want to report the parse error in time. + self.currentToken["data"][-1][0] = ( + self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) + for name, _ in self.currentToken["data"][:-1]: + if self.currentToken["data"][-1][0] == name: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "duplicate-attribute"}) + break + # XXX Fix for above XXX + if emitToken: + self.emitCurrentToken() + return True + + def afterAttributeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + self.stream.charsUntil(spaceCharacters, True) + elif data == "=": + self.state = self.beforeAttributeValueState + elif data == ">": + self.emitCurrentToken() + elif data in asciiLetters: + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + elif data == "/": + self.state = self.selfClosingStartTagState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"].append(["\uFFFD", ""]) + self.state = self.attributeNameState + elif data in ("'", '"', "<"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "invalid-character-after-attribute-name"}) + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-end-of-tag-but-got-eof"}) + self.state = self.dataState + else: + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + return True + + def beforeAttributeValueState(self): + data = self.stream.char() + if data in spaceCharacters: + self.stream.charsUntil(spaceCharacters, True) + elif data == "\"": + self.state = self.attributeValueDoubleQuotedState + elif data == "&": + self.state = self.attributeValueUnQuotedState + self.stream.unget(data) + elif data == "'": + self.state = self.attributeValueSingleQuotedState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-attribute-value-but-got-right-bracket"}) + self.emitCurrentToken() + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"][-1][1] += "\uFFFD" + self.state = self.attributeValueUnQuotedState + elif data in ("=", "<", "`"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "equals-in-unquoted-attribute-value"}) + self.currentToken["data"][-1][1] += data + self.state = self.attributeValueUnQuotedState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-attribute-value-but-got-eof"}) + self.state = self.dataState + else: + self.currentToken["data"][-1][1] += data + self.state = self.attributeValueUnQuotedState + return True + + def attributeValueDoubleQuotedState(self): + data = self.stream.char() + if data == "\"": + self.state = self.afterAttributeValueState + elif data == "&": + self.processEntityInAttribute('"') + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"][-1][1] += "\uFFFD" + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-attribute-value-double-quote"}) + self.state = self.dataState + else: + self.currentToken["data"][-1][1] += data +\ + self.stream.charsUntil(("\"", "&", "\u0000")) + return True + + def attributeValueSingleQuotedState(self): + data = self.stream.char() + if data == "'": + self.state = self.afterAttributeValueState + elif data == "&": + self.processEntityInAttribute("'") + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"][-1][1] += "\uFFFD" + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-attribute-value-single-quote"}) + self.state = self.dataState + else: + self.currentToken["data"][-1][1] += data +\ + self.stream.charsUntil(("'", "&", "\u0000")) + return True + + def attributeValueUnQuotedState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.beforeAttributeNameState + elif data == "&": + self.processEntityInAttribute(">") + elif data == ">": + self.emitCurrentToken() + elif data in ('"', "'", "=", "<", "`"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-character-in-unquoted-attribute-value"}) + self.currentToken["data"][-1][1] += data + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"][-1][1] += "\uFFFD" + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-attribute-value-no-quotes"}) + self.state = self.dataState + else: + self.currentToken["data"][-1][1] += data + self.stream.charsUntil( + frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters) + return True + + def afterAttributeValueState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.beforeAttributeNameState + elif data == ">": + self.emitCurrentToken() + elif data == "/": + self.state = self.selfClosingStartTagState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-EOF-after-attribute-value"}) + self.stream.unget(data) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-character-after-attribute-value"}) + self.stream.unget(data) + self.state = self.beforeAttributeNameState + return True + + def selfClosingStartTagState(self): + data = self.stream.char() + if data == ">": + self.currentToken["selfClosing"] = True + self.emitCurrentToken() + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": + "unexpected-EOF-after-solidus-in-tag"}) + self.stream.unget(data) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-character-after-solidus-in-tag"}) + self.stream.unget(data) + self.state = self.beforeAttributeNameState + return True + + def bogusCommentState(self): + # Make a new comment token and give it as value all the characters + # until the first > or EOF (charsUntil checks for EOF automatically) + # and emit it. + data = self.stream.charsUntil(">") + data = data.replace("\u0000", "\uFFFD") + self.tokenQueue.append( + {"type": tokenTypes["Comment"], "data": data}) + + # Eat the character directly after the bogus comment which is either a + # ">" or an EOF. + self.stream.char() + self.state = self.dataState + return True + + def markupDeclarationOpenState(self): + charStack = [self.stream.char()] + if charStack[-1] == "-": + charStack.append(self.stream.char()) + if charStack[-1] == "-": + self.currentToken = {"type": tokenTypes["Comment"], "data": ""} + self.state = self.commentStartState + return True + elif charStack[-1] in ('d', 'D'): + matched = True + for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'), + ('y', 'Y'), ('p', 'P'), ('e', 'E')): + charStack.append(self.stream.char()) + if charStack[-1] not in expected: + matched = False + break + if matched: + self.currentToken = {"type": tokenTypes["Doctype"], + "name": "", + "publicId": None, "systemId": None, + "correct": True} + self.state = self.doctypeState + return True + elif (charStack[-1] == "[" and + self.parser is not None and + self.parser.tree.openElements and + self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace): + matched = True + for expected in ["C", "D", "A", "T", "A", "["]: + charStack.append(self.stream.char()) + if charStack[-1] != expected: + matched = False + break + if matched: + self.state = self.cdataSectionState + return True + + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-dashes-or-doctype"}) + + while charStack: + self.stream.unget(charStack.pop()) + self.state = self.bogusCommentState + return True + + def commentStartState(self): + data = self.stream.char() + if data == "-": + self.state = self.commentStartDashState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"] += "\uFFFD" + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "incorrect-comment"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["data"] += data + self.state = self.commentState + return True + + def commentStartDashState(self): + data = self.stream.char() + if data == "-": + self.state = self.commentEndState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"] += "-\uFFFD" + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "incorrect-comment"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["data"] += "-" + data + self.state = self.commentState + return True + + def commentState(self): + data = self.stream.char() + if data == "-": + self.state = self.commentEndDashState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"] += "\uFFFD" + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "eof-in-comment"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["data"] += data + \ + self.stream.charsUntil(("-", "\u0000")) + return True + + def commentEndDashState(self): + data = self.stream.char() + if data == "-": + self.state = self.commentEndState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"] += "-\uFFFD" + self.state = self.commentState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment-end-dash"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["data"] += "-" + data + self.state = self.commentState + return True + + def commentEndState(self): + data = self.stream.char() + if data == ">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"] += "--\uFFFD" + self.state = self.commentState + elif data == "!": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-bang-after-double-dash-in-comment"}) + self.state = self.commentEndBangState + elif data == "-": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-dash-after-double-dash-in-comment"}) + self.currentToken["data"] += data + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment-double-dash"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + # XXX + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-comment"}) + self.currentToken["data"] += "--" + data + self.state = self.commentState + return True + + def commentEndBangState(self): + data = self.stream.char() + if data == ">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == "-": + self.currentToken["data"] += "--!" + self.state = self.commentEndDashState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["data"] += "--!\uFFFD" + self.state = self.commentState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment-end-bang-state"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["data"] += "--!" + data + self.state = self.commentState + return True + + def doctypeState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.beforeDoctypeNameState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-doctype-name-but-got-eof"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "need-space-after-doctype"}) + self.stream.unget(data) + self.state = self.beforeDoctypeNameState + return True + + def beforeDoctypeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-doctype-name-but-got-right-bracket"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["name"] = "\uFFFD" + self.state = self.doctypeNameState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-doctype-name-but-got-eof"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["name"] = data + self.state = self.doctypeNameState + return True + + def doctypeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) + self.state = self.afterDoctypeNameState + elif data == ">": + self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["name"] += "\uFFFD" + self.state = self.doctypeNameState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype-name"}) + self.currentToken["correct"] = False + self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["name"] += data + return True + + def afterDoctypeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == ">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.currentToken["correct"] = False + self.stream.unget(data) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + if data in ("p", "P"): + matched = True + for expected in (("u", "U"), ("b", "B"), ("l", "L"), + ("i", "I"), ("c", "C")): + data = self.stream.char() + if data not in expected: + matched = False + break + if matched: + self.state = self.afterDoctypePublicKeywordState + return True + elif data in ("s", "S"): + matched = True + for expected in (("y", "Y"), ("s", "S"), ("t", "T"), + ("e", "E"), ("m", "M")): + data = self.stream.char() + if data not in expected: + matched = False + break + if matched: + self.state = self.afterDoctypeSystemKeywordState + return True + + # All the characters read before the current 'data' will be + # [a-zA-Z], so they're garbage in the bogus doctype and can be + # discarded; only the latest character might be '>' or EOF + # and needs to be ungetted + self.stream.unget(data) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-space-or-right-bracket-in-doctype", "datavars": + {"data": data}}) + self.currentToken["correct"] = False + self.state = self.bogusDoctypeState + + return True + + def afterDoctypePublicKeywordState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.beforeDoctypePublicIdentifierState + elif data in ("'", '"'): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.stream.unget(data) + self.state = self.beforeDoctypePublicIdentifierState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.stream.unget(data) + self.state = self.beforeDoctypePublicIdentifierState + return True + + def beforeDoctypePublicIdentifierState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == "\"": + self.currentToken["publicId"] = "" + self.state = self.doctypePublicIdentifierDoubleQuotedState + elif data == "'": + self.currentToken["publicId"] = "" + self.state = self.doctypePublicIdentifierSingleQuotedState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["correct"] = False + self.state = self.bogusDoctypeState + return True + + def doctypePublicIdentifierDoubleQuotedState(self): + data = self.stream.char() + if data == "\"": + self.state = self.afterDoctypePublicIdentifierState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["publicId"] += "\uFFFD" + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["publicId"] += data + return True + + def doctypePublicIdentifierSingleQuotedState(self): + data = self.stream.char() + if data == "'": + self.state = self.afterDoctypePublicIdentifierState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["publicId"] += "\uFFFD" + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["publicId"] += data + return True + + def afterDoctypePublicIdentifierState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.betweenDoctypePublicAndSystemIdentifiersState + elif data == ">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == '"': + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["systemId"] = "" + self.state = self.doctypeSystemIdentifierDoubleQuotedState + elif data == "'": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["systemId"] = "" + self.state = self.doctypeSystemIdentifierSingleQuotedState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["correct"] = False + self.state = self.bogusDoctypeState + return True + + def betweenDoctypePublicAndSystemIdentifiersState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == ">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == '"': + self.currentToken["systemId"] = "" + self.state = self.doctypeSystemIdentifierDoubleQuotedState + elif data == "'": + self.currentToken["systemId"] = "" + self.state = self.doctypeSystemIdentifierSingleQuotedState + elif data == EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["correct"] = False + self.state = self.bogusDoctypeState + return True + + def afterDoctypeSystemKeywordState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.beforeDoctypeSystemIdentifierState + elif data in ("'", '"'): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.stream.unget(data) + self.state = self.beforeDoctypeSystemIdentifierState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.stream.unget(data) + self.state = self.beforeDoctypeSystemIdentifierState + return True + + def beforeDoctypeSystemIdentifierState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == "\"": + self.currentToken["systemId"] = "" + self.state = self.doctypeSystemIdentifierDoubleQuotedState + elif data == "'": + self.currentToken["systemId"] = "" + self.state = self.doctypeSystemIdentifierSingleQuotedState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["correct"] = False + self.state = self.bogusDoctypeState + return True + + def doctypeSystemIdentifierDoubleQuotedState(self): + data = self.stream.char() + if data == "\"": + self.state = self.afterDoctypeSystemIdentifierState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["systemId"] += "\uFFFD" + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["systemId"] += data + return True + + def doctypeSystemIdentifierSingleQuotedState(self): + data = self.stream.char() + if data == "'": + self.state = self.afterDoctypeSystemIdentifierState + elif data == "\u0000": + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + self.currentToken["systemId"] += "\uFFFD" + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["systemId"] += data + return True + + def afterDoctypeSystemIdentifierState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == ">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.state = self.bogusDoctypeState + return True + + def bogusDoctypeState(self): + data = self.stream.char() + if data == ">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + # XXX EMIT + self.stream.unget(data) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + pass + return True + + def cdataSectionState(self): + data = [] + while True: + data.append(self.stream.charsUntil("]")) + data.append(self.stream.charsUntil(">")) + char = self.stream.char() + if char == EOF: + break + else: + assert char == ">" + if data[-1][-2:] == "]]": + data[-1] = data[-1][:-2] + break + else: + data.append(char) + + data = "".join(data) # pylint:disable=redefined-variable-type + # Deal with null here rather than in the parser + nullCount = data.count("\u0000") + if nullCount > 0: + for _ in range(nullCount): + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "invalid-codepoint"}) + data = data.replace("\u0000", "\uFFFD") + if data: + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": data}) + self.state = self.dataState + return True -- cgit v1.2.3