diff options
Diffstat (limited to 'venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py')
-rw-r--r-- | venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py | 1721 |
1 file changed, 0 insertions, 1721 deletions
diff --git a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py b/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py deleted file mode 100644 index ef1ccf8..0000000 --- a/venv/lib/python3.7/site-packages/pip-10.0.1-py3.7.egg/pip/_vendor/html5lib/_tokenizer.py +++ /dev/null | |||
@@ -1,1721 +0,0 @@ | |||
1 | from __future__ import absolute_import, division, unicode_literals | ||
2 | |||
3 | from pip._vendor.six import unichr as chr | ||
4 | |||
5 | from collections import deque | ||
6 | |||
7 | from .constants import spaceCharacters | ||
8 | from .constants import entities | ||
9 | from .constants import asciiLetters, asciiUpper2Lower | ||
10 | from .constants import digits, hexDigits, EOF | ||
11 | from .constants import tokenTypes, tagTokenTypes | ||
12 | from .constants import replacementCharacters | ||
13 | |||
14 | from ._inputstream import HTMLInputStream | ||
15 | |||
16 | from ._trie import Trie | ||
17 | |||
# Trie over all named character references, built once at import time so
# every tokenizer instance can do longest-prefix entity matching cheaply.
entitiesTrie = Trie(entities)
19 | |||
20 | |||
21 | class HTMLTokenizer(object): | ||
22 | """ This class takes care of tokenizing HTML. | ||
23 | |||
24 | * self.currentToken | ||
25 | Holds the token that is currently being processed. | ||
26 | |||
27 | * self.state | ||
28 | Holds a reference to the method to be invoked... XXX | ||
29 | |||
30 | * self.stream | ||
31 | Points to HTMLInputStream object. | ||
32 | """ | ||
33 | |||
34 | def __init__(self, stream, parser=None, **kwargs): | ||
35 | |||
36 | self.stream = HTMLInputStream(stream, **kwargs) | ||
37 | self.parser = parser | ||
38 | |||
39 | # Setup the initial tokenizer state | ||
40 | self.escapeFlag = False | ||
41 | self.lastFourChars = [] | ||
42 | self.state = self.dataState | ||
43 | self.escape = False | ||
44 | |||
45 | # The current token being created | ||
46 | self.currentToken = None | ||
47 | super(HTMLTokenizer, self).__init__() | ||
48 | |||
    def __iter__(self):
        """Drive the state machine and lazily yield tokens.

        Each tokenizer state method consumes input and pushes any completed
        tokens onto ``self.tokenQueue``; yielding here pauses processing
        until the caller requests the next token. Stream-level parse errors
        are drained first so they are reported before the tokens that
        follow them.
        """
        self.tokenQueue = deque([])
        # Start processing. When EOF is reached self.state will return False
        # instead of True and the loop will terminate.
        while self.state():
            while self.stream.errors:
                yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
            while self.tokenQueue:
                yield self.tokenQueue.popleft()
64 | |||
    def consumeNumberEntity(self, isHex):
        """Consume a numeric character reference and return its character.

        ``isHex`` selects hexadecimal (base 16) vs decimal (base 10) digits.
        Returns either the referenced character (or U+FFFD for illegal
        codepoints) and discards a trailing ";" if present; a missing
        semicolon queues a "numeric-entity-without-semicolon" parse error.
        The caller guarantees at least one valid digit is next in the
        stream (see consumeEntity), so int() below cannot see "".
        """

        allowed = digits
        radix = 10
        if isHex:
            allowed = hexDigits
            radix = 16

        charStack = []

        # Consume all the characters that are in range while making sure we
        # don't hit an EOF.
        c = self.stream.char()
        while c in allowed and c is not EOF:
            charStack.append(c)
            c = self.stream.char()

        # Convert the set of characters consumed to an int.
        charAsInt = int("".join(charStack), radix)

        # Certain characters get replaced with others (e.g. Windows-1252
        # mappings for the C1 range) — still a parse error.
        if charAsInt in replacementCharacters:
            char = replacementCharacters[charAsInt]
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        elif ((0xD800 <= charAsInt <= 0xDFFF) or
              (charAsInt > 0x10FFFF)):
            # Surrogates and out-of-range codepoints become U+FFFD.
            char = "\uFFFD"
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        else:
            # Control characters and noncharacters are flagged but still
            # converted to the referenced character below.
            # Should speed up this check somehow (e.g. move the set to a constant)
            if ((0x0001 <= charAsInt <= 0x0008) or
                (0x000E <= charAsInt <= 0x001F) or
                (0x007F <= charAsInt <= 0x009F) or
                (0xFDD0 <= charAsInt <= 0xFDEF) or
                charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
                                        0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                        0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
                                        0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
                                        0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
                                        0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
                                        0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                        0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
                                        0xFFFFF, 0x10FFFE, 0x10FFFF])):
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data":
                                        "illegal-codepoint-for-numeric-entity",
                                        "datavars": {"charAsInt": charAsInt}})
            try:
                # Try/except needed as UCS-2 Python builds' unichar only works
                # within the BMP.
                char = chr(charAsInt)
            except ValueError:
                # Build a surrogate pair by hand for astral codepoints.
                v = charAsInt - 0x10000
                char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))

        # Discard the ; if present. Otherwise, put it back on the queue and
        # invoke parseError on parser.
        if c != ";":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "numeric-entity-without-semicolon"})
            self.stream.unget(c)

        return char
136 | |||
137 | def consumeEntity(self, allowedChar=None, fromAttribute=False): | ||
138 | # Initialise to the default output for when no entity is matched | ||
139 | output = "&" | ||
140 | |||
141 | charStack = [self.stream.char()] | ||
142 | if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or | ||
143 | (allowedChar is not None and allowedChar == charStack[0])): | ||
144 | self.stream.unget(charStack[0]) | ||
145 | |||
146 | elif charStack[0] == "#": | ||
147 | # Read the next character to see if it's hex or decimal | ||
148 | hex = False | ||
149 | charStack.append(self.stream.char()) | ||
150 | if charStack[-1] in ("x", "X"): | ||
151 | hex = True | ||
152 | charStack.append(self.stream.char()) | ||
153 | |||
154 | # charStack[-1] should be the first digit | ||
155 | if (hex and charStack[-1] in hexDigits) \ | ||
156 | or (not hex and charStack[-1] in digits): | ||
157 | # At least one digit found, so consume the whole number | ||
158 | self.stream.unget(charStack[-1]) | ||
159 | output = self.consumeNumberEntity(hex) | ||
160 | else: | ||
161 | # No digits found | ||
162 | self.tokenQueue.append({"type": tokenTypes["ParseError"], | ||
163 | "data": "expected-numeric-entity"}) | ||
164 | self.stream.unget(charStack.pop()) | ||
165 | output = "&" + "".join(charStack) | ||
166 | |||
167 | else: | ||
168 | # At this point in the process might have named entity. Entities | ||
169 | # are stored in the global variable "entities". | ||
170 | # | ||
171 | # Consume characters and compare to these to a substring of the | ||
172 | # entity names in the list until the substring no longer matches. | ||
173 | while (charStack[-1] is not EOF): | ||
174 | if not entitiesTrie.has_keys_with_prefix("".join(charStack)): | ||
175 | break | ||
176 | charStack.append(self.stream.char()) | ||
177 | |||
178 | # At this point we have a string that starts with some characters | ||
179 | # that may match an entity | ||
180 | # Try to find the longest entity the string will match to take care | ||
181 | # of ¬i for instance. | ||
182 | try: | ||
183 | entityName = entitiesTrie.longest_prefix("".join(charStack[:-1])) | ||
184 | entityLength = len(entityName) | ||
185 | except KeyError: | ||
186 | entityName = None | ||
187 | |||
188 | if entityName is not None: | ||
189 | if entityName[-1] != ";": | ||
190 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
191 | "named-entity-without-semicolon"}) | ||
192 | if (entityName[-1] != ";" and fromAttribute and | ||
193 | (charStack[entityLength] in asciiLetters or | ||
194 | charStack[entityLength] in digits or | ||
195 | charStack[entityLength] == "=")): | ||
196 | self.stream.unget(charStack.pop()) | ||
197 | output = "&" + "".join(charStack) | ||
198 | else: | ||
199 | output = entities[entityName] | ||
200 | self.stream.unget(charStack.pop()) | ||
201 | output += "".join(charStack[entityLength:]) | ||
202 | else: | ||
203 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
204 | "expected-named-entity"}) | ||
205 | self.stream.unget(charStack.pop()) | ||
206 | output = "&" + "".join(charStack) | ||
207 | |||
208 | if fromAttribute: | ||
209 | self.currentToken["data"][-1][1] += output | ||
210 | else: | ||
211 | if output in spaceCharacters: | ||
212 | tokenType = "SpaceCharacters" | ||
213 | else: | ||
214 | tokenType = "Characters" | ||
215 | self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output}) | ||
216 | |||
217 | def processEntityInAttribute(self, allowedChar): | ||
218 | """This method replaces the need for "entityInAttributeValueState". | ||
219 | """ | ||
220 | self.consumeEntity(allowedChar=allowedChar, fromAttribute=True) | ||
221 | |||
222 | def emitCurrentToken(self): | ||
223 | """This method is a generic handler for emitting the tags. It also sets | ||
224 | the state to "data" because that's what's needed after a token has been | ||
225 | emitted. | ||
226 | """ | ||
227 | token = self.currentToken | ||
228 | # Add token to the queue to be yielded | ||
229 | if (token["type"] in tagTokenTypes): | ||
230 | token["name"] = token["name"].translate(asciiUpper2Lower) | ||
231 | if token["type"] == tokenTypes["EndTag"]: | ||
232 | if token["data"]: | ||
233 | self.tokenQueue.append({"type": tokenTypes["ParseError"], | ||
234 | "data": "attributes-in-end-tag"}) | ||
235 | if token["selfClosing"]: | ||
236 | self.tokenQueue.append({"type": tokenTypes["ParseError"], | ||
237 | "data": "self-closing-flag-on-end-tag"}) | ||
238 | self.tokenQueue.append(token) | ||
239 | self.state = self.dataState | ||
240 | |||
241 | # Below are the various tokenizer states worked out. | ||
242 | def dataState(self): | ||
243 | data = self.stream.char() | ||
244 | if data == "&": | ||
245 | self.state = self.entityDataState | ||
246 | elif data == "<": | ||
247 | self.state = self.tagOpenState | ||
248 | elif data == "\u0000": | ||
249 | self.tokenQueue.append({"type": tokenTypes["ParseError"], | ||
250 | "data": "invalid-codepoint"}) | ||
251 | self.tokenQueue.append({"type": tokenTypes["Characters"], | ||
252 | "data": "\u0000"}) | ||
253 | elif data is EOF: | ||
254 | # Tokenization ends. | ||
255 | return False | ||
256 | elif data in spaceCharacters: | ||
257 | # Directly after emitting a token you switch back to the "data | ||
258 | # state". At that point spaceCharacters are important so they are | ||
259 | # emitted separately. | ||
260 | self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": | ||
261 | data + self.stream.charsUntil(spaceCharacters, True)}) | ||
262 | # No need to update lastFourChars here, since the first space will | ||
263 | # have already been appended to lastFourChars and will have broken | ||
264 | # any <!-- or --> sequences | ||
265 | else: | ||
266 | chars = self.stream.charsUntil(("&", "<", "\u0000")) | ||
267 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": | ||
268 | data + chars}) | ||
269 | return True | ||
270 | |||
271 | def entityDataState(self): | ||
272 | self.consumeEntity() | ||
273 | self.state = self.dataState | ||
274 | return True | ||
275 | |||
276 | def rcdataState(self): | ||
277 | data = self.stream.char() | ||
278 | if data == "&": | ||
279 | self.state = self.characterReferenceInRcdata | ||
280 | elif data == "<": | ||
281 | self.state = self.rcdataLessThanSignState | ||
282 | elif data == EOF: | ||
283 | # Tokenization ends. | ||
284 | return False | ||
285 | elif data == "\u0000": | ||
286 | self.tokenQueue.append({"type": tokenTypes["ParseError"], | ||
287 | "data": "invalid-codepoint"}) | ||
288 | self.tokenQueue.append({"type": tokenTypes["Characters"], | ||
289 | "data": "\uFFFD"}) | ||
290 | elif data in spaceCharacters: | ||
291 | # Directly after emitting a token you switch back to the "data | ||
292 | # state". At that point spaceCharacters are important so they are | ||
293 | # emitted separately. | ||
294 | self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": | ||
295 | data + self.stream.charsUntil(spaceCharacters, True)}) | ||
296 | # No need to update lastFourChars here, since the first space will | ||
297 | # have already been appended to lastFourChars and will have broken | ||
298 | # any <!-- or --> sequences | ||
299 | else: | ||
300 | chars = self.stream.charsUntil(("&", "<", "\u0000")) | ||
301 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": | ||
302 | data + chars}) | ||
303 | return True | ||
304 | |||
305 | def characterReferenceInRcdata(self): | ||
306 | self.consumeEntity() | ||
307 | self.state = self.rcdataState | ||
308 | return True | ||
309 | |||
310 | def rawtextState(self): | ||
311 | data = self.stream.char() | ||
312 | if data == "<": | ||
313 | self.state = self.rawtextLessThanSignState | ||
314 | elif data == "\u0000": | ||
315 | self.tokenQueue.append({"type": tokenTypes["ParseError"], | ||
316 | "data": "invalid-codepoint"}) | ||
317 | self.tokenQueue.append({"type": tokenTypes["Characters"], | ||
318 | "data": "\uFFFD"}) | ||
319 | elif data == EOF: | ||
320 | # Tokenization ends. | ||
321 | return False | ||
322 | else: | ||
323 | chars = self.stream.charsUntil(("<", "\u0000")) | ||
324 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": | ||
325 | data + chars}) | ||
326 | return True | ||
327 | |||
328 | def scriptDataState(self): | ||
329 | data = self.stream.char() | ||
330 | if data == "<": | ||
331 | self.state = self.scriptDataLessThanSignState | ||
332 | elif data == "\u0000": | ||
333 | self.tokenQueue.append({"type": tokenTypes["ParseError"], | ||
334 | "data": "invalid-codepoint"}) | ||
335 | self.tokenQueue.append({"type": tokenTypes["Characters"], | ||
336 | "data": "\uFFFD"}) | ||
337 | elif data == EOF: | ||
338 | # Tokenization ends. | ||
339 | return False | ||
340 | else: | ||
341 | chars = self.stream.charsUntil(("<", "\u0000")) | ||
342 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": | ||
343 | data + chars}) | ||
344 | return True | ||
345 | |||
346 | def plaintextState(self): | ||
347 | data = self.stream.char() | ||
348 | if data == EOF: | ||
349 | # Tokenization ends. | ||
350 | return False | ||
351 | elif data == "\u0000": | ||
352 | self.tokenQueue.append({"type": tokenTypes["ParseError"], | ||
353 | "data": "invalid-codepoint"}) | ||
354 | self.tokenQueue.append({"type": tokenTypes["Characters"], | ||
355 | "data": "\uFFFD"}) | ||
356 | else: | ||
357 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": | ||
358 | data + self.stream.charsUntil("\u0000")}) | ||
359 | return True | ||
360 | |||
361 | def tagOpenState(self): | ||
362 | data = self.stream.char() | ||
363 | if data == "!": | ||
364 | self.state = self.markupDeclarationOpenState | ||
365 | elif data == "/": | ||
366 | self.state = self.closeTagOpenState | ||
367 | elif data in asciiLetters: | ||
368 | self.currentToken = {"type": tokenTypes["StartTag"], | ||
369 | "name": data, "data": [], | ||
370 | "selfClosing": False, | ||
371 | "selfClosingAcknowledged": False} | ||
372 | self.state = self.tagNameState | ||
373 | elif data == ">": | ||
374 | # XXX In theory it could be something besides a tag name. But | ||
375 | # do we really care? | ||
376 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
377 | "expected-tag-name-but-got-right-bracket"}) | ||
378 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"}) | ||
379 | self.state = self.dataState | ||
380 | elif data == "?": | ||
381 | # XXX In theory it could be something besides a tag name. But | ||
382 | # do we really care? | ||
383 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
384 | "expected-tag-name-but-got-question-mark"}) | ||
385 | self.stream.unget(data) | ||
386 | self.state = self.bogusCommentState | ||
387 | else: | ||
388 | # XXX | ||
389 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
390 | "expected-tag-name"}) | ||
391 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) | ||
392 | self.stream.unget(data) | ||
393 | self.state = self.dataState | ||
394 | return True | ||
395 | |||
396 | def closeTagOpenState(self): | ||
397 | data = self.stream.char() | ||
398 | if data in asciiLetters: | ||
399 | self.currentToken = {"type": tokenTypes["EndTag"], "name": data, | ||
400 | "data": [], "selfClosing": False} | ||
401 | self.state = self.tagNameState | ||
402 | elif data == ">": | ||
403 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
404 | "expected-closing-tag-but-got-right-bracket"}) | ||
405 | self.state = self.dataState | ||
406 | elif data is EOF: | ||
407 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
408 | "expected-closing-tag-but-got-eof"}) | ||
409 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) | ||
410 | self.state = self.dataState | ||
411 | else: | ||
412 | # XXX data can be _'_... | ||
413 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
414 | "expected-closing-tag-but-got-char", | ||
415 | "datavars": {"data": data}}) | ||
416 | self.stream.unget(data) | ||
417 | self.state = self.bogusCommentState | ||
418 | return True | ||
419 | |||
420 | def tagNameState(self): | ||
421 | data = self.stream.char() | ||
422 | if data in spaceCharacters: | ||
423 | self.state = self.beforeAttributeNameState | ||
424 | elif data == ">": | ||
425 | self.emitCurrentToken() | ||
426 | elif data is EOF: | ||
427 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
428 | "eof-in-tag-name"}) | ||
429 | self.state = self.dataState | ||
430 | elif data == "/": | ||
431 | self.state = self.selfClosingStartTagState | ||
432 | elif data == "\u0000": | ||
433 | self.tokenQueue.append({"type": tokenTypes["ParseError"], | ||
434 | "data": "invalid-codepoint"}) | ||
435 | self.currentToken["name"] += "\uFFFD" | ||
436 | else: | ||
437 | self.currentToken["name"] += data | ||
438 | # (Don't use charsUntil here, because tag names are | ||
439 | # very short and it's faster to not do anything fancy) | ||
440 | return True | ||
441 | |||
442 | def rcdataLessThanSignState(self): | ||
443 | data = self.stream.char() | ||
444 | if data == "/": | ||
445 | self.temporaryBuffer = "" | ||
446 | self.state = self.rcdataEndTagOpenState | ||
447 | else: | ||
448 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) | ||
449 | self.stream.unget(data) | ||
450 | self.state = self.rcdataState | ||
451 | return True | ||
452 | |||
453 | def rcdataEndTagOpenState(self): | ||
454 | data = self.stream.char() | ||
455 | if data in asciiLetters: | ||
456 | self.temporaryBuffer += data | ||
457 | self.state = self.rcdataEndTagNameState | ||
458 | else: | ||
459 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) | ||
460 | self.stream.unget(data) | ||
461 | self.state = self.rcdataState | ||
462 | return True | ||
463 | |||
464 | def rcdataEndTagNameState(self): | ||
465 | appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() | ||
466 | data = self.stream.char() | ||
467 | if data in spaceCharacters and appropriate: | ||
468 | self.currentToken = {"type": tokenTypes["EndTag"], | ||
469 | "name": self.temporaryBuffer, | ||
470 | "data": [], "selfClosing": False} | ||
471 | self.state = self.beforeAttributeNameState | ||
472 | elif data == "/" and appropriate: | ||
473 | self.currentToken = {"type": tokenTypes["EndTag"], | ||
474 | "name": self.temporaryBuffer, | ||
475 | "data": [], "selfClosing": False} | ||
476 | self.state = self.selfClosingStartTagState | ||
477 | elif data == ">" and appropriate: | ||
478 | self.currentToken = {"type": tokenTypes["EndTag"], | ||
479 | "name": self.temporaryBuffer, | ||
480 | "data": [], "selfClosing": False} | ||
481 | self.emitCurrentToken() | ||
482 | self.state = self.dataState | ||
483 | elif data in asciiLetters: | ||
484 | self.temporaryBuffer += data | ||
485 | else: | ||
486 | self.tokenQueue.append({"type": tokenTypes["Characters"], | ||
487 | "data": "</" + self.temporaryBuffer}) | ||
488 | self.stream.unget(data) | ||
489 | self.state = self.rcdataState | ||
490 | return True | ||
491 | |||
492 | def rawtextLessThanSignState(self): | ||
493 | data = self.stream.char() | ||
494 | if data == "/": | ||
495 | self.temporaryBuffer = "" | ||
496 | self.state = self.rawtextEndTagOpenState | ||
497 | else: | ||
498 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) | ||
499 | self.stream.unget(data) | ||
500 | self.state = self.rawtextState | ||
501 | return True | ||
502 | |||
503 | def rawtextEndTagOpenState(self): | ||
504 | data = self.stream.char() | ||
505 | if data in asciiLetters: | ||
506 | self.temporaryBuffer += data | ||
507 | self.state = self.rawtextEndTagNameState | ||
508 | else: | ||
509 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) | ||
510 | self.stream.unget(data) | ||
511 | self.state = self.rawtextState | ||
512 | return True | ||
513 | |||
514 | def rawtextEndTagNameState(self): | ||
515 | appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() | ||
516 | data = self.stream.char() | ||
517 | if data in spaceCharacters and appropriate: | ||
518 | self.currentToken = {"type": tokenTypes["EndTag"], | ||
519 | "name": self.temporaryBuffer, | ||
520 | "data": [], "selfClosing": False} | ||
521 | self.state = self.beforeAttributeNameState | ||
522 | elif data == "/" and appropriate: | ||
523 | self.currentToken = {"type": tokenTypes["EndTag"], | ||
524 | "name": self.temporaryBuffer, | ||
525 | "data": [], "selfClosing": False} | ||
526 | self.state = self.selfClosingStartTagState | ||
527 | elif data == ">" and appropriate: | ||
528 | self.currentToken = {"type": tokenTypes["EndTag"], | ||
529 | "name": self.temporaryBuffer, | ||
530 | "data": [], "selfClosing": False} | ||
531 | self.emitCurrentToken() | ||
532 | self.state = self.dataState | ||
533 | elif data in asciiLetters: | ||
534 | self.temporaryBuffer += data | ||
535 | else: | ||
536 | self.tokenQueue.append({"type": tokenTypes["Characters"], | ||
537 | "data": "</" + self.temporaryBuffer}) | ||
538 | self.stream.unget(data) | ||
539 | self.state = self.rawtextState | ||
540 | return True | ||
541 | |||
542 | def scriptDataLessThanSignState(self): | ||
543 | data = self.stream.char() | ||
544 | if data == "/": | ||
545 | self.temporaryBuffer = "" | ||
546 | self.state = self.scriptDataEndTagOpenState | ||
547 | elif data == "!": | ||
548 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"}) | ||
549 | self.state = self.scriptDataEscapeStartState | ||
550 | else: | ||
551 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) | ||
552 | self.stream.unget(data) | ||
553 | self.state = self.scriptDataState | ||
554 | return True | ||
555 | |||
556 | def scriptDataEndTagOpenState(self): | ||
557 | data = self.stream.char() | ||
558 | if data in asciiLetters: | ||
559 | self.temporaryBuffer += data | ||
560 | self.state = self.scriptDataEndTagNameState | ||
561 | else: | ||
562 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) | ||
563 | self.stream.unget(data) | ||
564 | self.state = self.scriptDataState | ||
565 | return True | ||
566 | |||
567 | def scriptDataEndTagNameState(self): | ||
568 | appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() | ||
569 | data = self.stream.char() | ||
570 | if data in spaceCharacters and appropriate: | ||
571 | self.currentToken = {"type": tokenTypes["EndTag"], | ||
572 | "name": self.temporaryBuffer, | ||
573 | "data": [], "selfClosing": False} | ||
574 | self.state = self.beforeAttributeNameState | ||
575 | elif data == "/" and appropriate: | ||
576 | self.currentToken = {"type": tokenTypes["EndTag"], | ||
577 | "name": self.temporaryBuffer, | ||
578 | "data": [], "selfClosing": False} | ||
579 | self.state = self.selfClosingStartTagState | ||
580 | elif data == ">" and appropriate: | ||
581 | self.currentToken = {"type": tokenTypes["EndTag"], | ||
582 | "name": self.temporaryBuffer, | ||
583 | "data": [], "selfClosing": False} | ||
584 | self.emitCurrentToken() | ||
585 | self.state = self.dataState | ||
586 | elif data in asciiLetters: | ||
587 | self.temporaryBuffer += data | ||
588 | else: | ||
589 | self.tokenQueue.append({"type": tokenTypes["Characters"], | ||
590 | "data": "</" + self.temporaryBuffer}) | ||
591 | self.stream.unget(data) | ||
592 | self.state = self.scriptDataState | ||
593 | return True | ||
594 | |||
595 | def scriptDataEscapeStartState(self): | ||
596 | data = self.stream.char() | ||
597 | if data == "-": | ||
598 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) | ||
599 | self.state = self.scriptDataEscapeStartDashState | ||
600 | else: | ||
601 | self.stream.unget(data) | ||
602 | self.state = self.scriptDataState | ||
603 | return True | ||
604 | |||
605 | def scriptDataEscapeStartDashState(self): | ||
606 | data = self.stream.char() | ||
607 | if data == "-": | ||
608 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) | ||
609 | self.state = self.scriptDataEscapedDashDashState | ||
610 | else: | ||
611 | self.stream.unget(data) | ||
612 | self.state = self.scriptDataState | ||
613 | return True | ||
614 | |||
615 | def scriptDataEscapedState(self): | ||
616 | data = self.stream.char() | ||
617 | if data == "-": | ||
618 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) | ||
619 | self.state = self.scriptDataEscapedDashState | ||
620 | elif data == "<": | ||
621 | self.state = self.scriptDataEscapedLessThanSignState | ||
622 | elif data == "\u0000": | ||
623 | self.tokenQueue.append({"type": tokenTypes["ParseError"], | ||
624 | "data": "invalid-codepoint"}) | ||
625 | self.tokenQueue.append({"type": tokenTypes["Characters"], | ||
626 | "data": "\uFFFD"}) | ||
627 | elif data == EOF: | ||
628 | self.state = self.dataState | ||
629 | else: | ||
630 | chars = self.stream.charsUntil(("<", "-", "\u0000")) | ||
631 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": | ||
632 | data + chars}) | ||
633 | return True | ||
634 | |||
635 | def scriptDataEscapedDashState(self): | ||
636 | data = self.stream.char() | ||
637 | if data == "-": | ||
638 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) | ||
639 | self.state = self.scriptDataEscapedDashDashState | ||
640 | elif data == "<": | ||
641 | self.state = self.scriptDataEscapedLessThanSignState | ||
642 | elif data == "\u0000": | ||
643 | self.tokenQueue.append({"type": tokenTypes["ParseError"], | ||
644 | "data": "invalid-codepoint"}) | ||
645 | self.tokenQueue.append({"type": tokenTypes["Characters"], | ||
646 | "data": "\uFFFD"}) | ||
647 | self.state = self.scriptDataEscapedState | ||
648 | elif data == EOF: | ||
649 | self.state = self.dataState | ||
650 | else: | ||
651 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) | ||
652 | self.state = self.scriptDataEscapedState | ||
653 | return True | ||
654 | |||
655 | def scriptDataEscapedDashDashState(self): | ||
656 | data = self.stream.char() | ||
657 | if data == "-": | ||
658 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) | ||
659 | elif data == "<": | ||
660 | self.state = self.scriptDataEscapedLessThanSignState | ||
661 | elif data == ">": | ||
662 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) | ||
663 | self.state = self.scriptDataState | ||
664 | elif data == "\u0000": | ||
665 | self.tokenQueue.append({"type": tokenTypes["ParseError"], | ||
666 | "data": "invalid-codepoint"}) | ||
667 | self.tokenQueue.append({"type": tokenTypes["Characters"], | ||
668 | "data": "\uFFFD"}) | ||
669 | self.state = self.scriptDataEscapedState | ||
670 | elif data == EOF: | ||
671 | self.state = self.dataState | ||
672 | else: | ||
673 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) | ||
674 | self.state = self.scriptDataEscapedState | ||
675 | return True | ||
676 | |||
677 | def scriptDataEscapedLessThanSignState(self): | ||
678 | data = self.stream.char() | ||
679 | if data == "/": | ||
680 | self.temporaryBuffer = "" | ||
681 | self.state = self.scriptDataEscapedEndTagOpenState | ||
682 | elif data in asciiLetters: | ||
683 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data}) | ||
684 | self.temporaryBuffer = data | ||
685 | self.state = self.scriptDataDoubleEscapeStartState | ||
686 | else: | ||
687 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) | ||
688 | self.stream.unget(data) | ||
689 | self.state = self.scriptDataEscapedState | ||
690 | return True | ||
691 | |||
692 | def scriptDataEscapedEndTagOpenState(self): | ||
693 | data = self.stream.char() | ||
694 | if data in asciiLetters: | ||
695 | self.temporaryBuffer = data | ||
696 | self.state = self.scriptDataEscapedEndTagNameState | ||
697 | else: | ||
698 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"}) | ||
699 | self.stream.unget(data) | ||
700 | self.state = self.scriptDataEscapedState | ||
701 | return True | ||
702 | |||
703 | def scriptDataEscapedEndTagNameState(self): | ||
704 | appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() | ||
705 | data = self.stream.char() | ||
706 | if data in spaceCharacters and appropriate: | ||
707 | self.currentToken = {"type": tokenTypes["EndTag"], | ||
708 | "name": self.temporaryBuffer, | ||
709 | "data": [], "selfClosing": False} | ||
710 | self.state = self.beforeAttributeNameState | ||
711 | elif data == "/" and appropriate: | ||
712 | self.currentToken = {"type": tokenTypes["EndTag"], | ||
713 | "name": self.temporaryBuffer, | ||
714 | "data": [], "selfClosing": False} | ||
715 | self.state = self.selfClosingStartTagState | ||
716 | elif data == ">" and appropriate: | ||
717 | self.currentToken = {"type": tokenTypes["EndTag"], | ||
718 | "name": self.temporaryBuffer, | ||
719 | "data": [], "selfClosing": False} | ||
720 | self.emitCurrentToken() | ||
721 | self.state = self.dataState | ||
722 | elif data in asciiLetters: | ||
723 | self.temporaryBuffer += data | ||
724 | else: | ||
725 | self.tokenQueue.append({"type": tokenTypes["Characters"], | ||
726 | "data": "</" + self.temporaryBuffer}) | ||
727 | self.stream.unget(data) | ||
728 | self.state = self.scriptDataEscapedState | ||
729 | return True | ||
730 | |||
    def scriptDataDoubleEscapeStartState(self):
        """Collect a tag name inside escaped script data to decide whether
        the tokenizer is entering double-escaped mode."""
        data = self.stream.char()
        if data in (spaceCharacters | frozenset(("/", ">"))):
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            # A completed name of "script" enters double-escaped mode.
            if self.temporaryBuffer.lower() == "script":
                self.state = self.scriptDataDoubleEscapedState
            else:
                self.state = self.scriptDataEscapedState
        elif data in asciiLetters:
            # Letters are both emitted as characters and buffered for the check.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.temporaryBuffer += data
        else:
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True
746 | |||
    def scriptDataDoubleEscapedState(self):
        """Emit double-escaped script data as character tokens, watching for
        "-" (possible comment end) and "<" (possible escape end)."""
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataDoubleEscapedDashState
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == "\u0000":
            # NUL is a parse error and is replaced by U+FFFD.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        return True
767 | |||
    def scriptDataDoubleEscapedDashState(self):
        """Handle the character after one "-" in double-escaped script data."""
        data = self.stream.char()
        if data == "-":
            # Second dash: maybe the "-->" that ends the fake comment.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataDoubleEscapedDashDashState
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataDoubleEscapedState
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataDoubleEscapedState
        return True
790 | |||
    def scriptDataDoubleEscapedDashDashState(self):
        """Handle characters after "--" in double-escaped script data."""
        data = self.stream.char()
        if data == "-":
            # Additional dashes stay in this state (e.g. "---->").
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == ">":
            # "-->" ends the escape; back to plain script data.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
            self.state = self.scriptDataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataDoubleEscapedState
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataDoubleEscapedState
        return True
815 | |||
816 | def scriptDataDoubleEscapedLessThanSignState(self): | ||
817 | data = self.stream.char() | ||
818 | if data == "/": | ||
819 | self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"}) | ||
820 | self.temporaryBuffer = "" | ||
821 | self.state = self.scriptDataDoubleEscapeEndState | ||
822 | else: | ||
823 | self.stream.unget(data) | ||
824 | self.state = self.scriptDataDoubleEscapedState | ||
825 | return True | ||
826 | |||
    def scriptDataDoubleEscapeEndState(self):
        """Collect a tag name after "</" inside double-escaped script data
        to decide whether the double escape is ending."""
        data = self.stream.char()
        if data in (spaceCharacters | frozenset(("/", ">"))):
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            # A completed name of "script" leaves double-escaped mode.
            if self.temporaryBuffer.lower() == "script":
                self.state = self.scriptDataEscapedState
            else:
                self.state = self.scriptDataDoubleEscapedState
        elif data in asciiLetters:
            # Letters are both emitted as characters and buffered for the check.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.temporaryBuffer += data
        else:
            self.stream.unget(data)
            self.state = self.scriptDataDoubleEscapedState
        return True
842 | |||
    def beforeAttributeNameState(self):
        """Skip whitespace before an attribute name and start a new
        [name, value] pair on the current tag token when one begins."""
        data = self.stream.char()
        if data in spaceCharacters:
            # Consume the whole run of whitespace in one call.
            self.stream.charsUntil(spaceCharacters, True)
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data in ("'", '"', "=", "<"):
            # Invalid but recoverable: the character starts the name anyway.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-name-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True
872 | |||
    def attributeNameState(self):
        """Accumulate the current attribute's name.

        On leaving this state the name is lowercased and checked against the
        earlier attributes for duplicates (a parse error, but the attribute
        is kept for now — dropping happens at emit time).
        """
        data = self.stream.char()
        # leavingThisState: run the lowercase/duplicate check below.
        # emitToken: defer emitCurrentToken until after that check (see XXX).
        leavingThisState = True
        emitToken = False
        if data == "=":
            self.state = self.beforeAttributeValueState
        elif data in asciiLetters:
            # Bulk-consume the rest of the letter run in one call.
            self.currentToken["data"][-1][0] += data +\
                self.stream.charsUntil(asciiLetters, True)
            leavingThisState = False
        elif data == ">":
            # XXX If we emit here the attributes are converted to a dict
            # without being checked and when the code below runs we error
            # because data is a dict not a list
            emitToken = True
        elif data in spaceCharacters:
            self.state = self.afterAttributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][0] += "\uFFFD"
            leavingThisState = False
        elif data in ("'", '"', "<"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"][-1][0] += data
            leavingThisState = False
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-attribute-name"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][0] += data
            leavingThisState = False

        if leavingThisState:
            # Attributes are not dropped at this stage. That happens when the
            # start tag token is emitted so values can still be safely appended
            # to attributes, but we do want to report the parse error in time.
            self.currentToken["data"][-1][0] = (
                self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
            for name, _ in self.currentToken["data"][:-1]:
                if self.currentToken["data"][-1][0] == name:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "duplicate-attribute"})
                    break
            # XXX Fix for above XXX
            if emitToken:
                self.emitCurrentToken()
        return True
926 | |||
    def afterAttributeNameState(self):
        """Handle the character following a completed attribute name:
        either a value follows ("="), the tag ends, or a new attribute starts."""
        data = self.stream.char()
        if data in spaceCharacters:
            # Consume the whole run of whitespace in one call.
            self.stream.charsUntil(spaceCharacters, True)
        elif data == "=":
            self.state = self.beforeAttributeValueState
        elif data == ">":
            self.emitCurrentToken()
        elif data in asciiLetters:
            # A letter begins the next attribute.
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data in ("'", '"', "<"):
            # Invalid but recoverable: starts the next attribute's name.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-after-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-end-of-tag-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True
958 | |||
    def beforeAttributeValueState(self):
        """Decide how the attribute value is delimited (double-quoted,
        single-quoted, or unquoted) based on the first non-space character."""
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == "\"":
            self.state = self.attributeValueDoubleQuotedState
        elif data == "&":
            # "&" belongs to the unquoted value; push it back so the
            # unquoted state can process the entity.
            self.state = self.attributeValueUnQuotedState
            self.stream.unget(data)
        elif data == "'":
            self.state = self.attributeValueSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-value-but-got-right-bracket"})
            self.emitCurrentToken()
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
            self.state = self.attributeValueUnQuotedState
        elif data in ("=", "<", "`"):
            # Parse error, but the character still starts an unquoted value.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "equals-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
            self.state = self.attributeValueUnQuotedState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-value-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data
            self.state = self.attributeValueUnQuotedState
        return True
992 | |||
    def attributeValueDoubleQuotedState(self):
        """Tokenize an attribute value delimited by double quotes."""
        data = self.stream.char()
        if data == "\"":
            self.state = self.afterAttributeValueState
        elif data == "&":
            # Entity resolution; '"' is the extra terminating character.
            self.processEntityInAttribute('"')
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-attribute-value-double-quote"})
            self.state = self.dataState
        else:
            # Ordinary text: bulk-consume up to the next delimiter.
            self.currentToken["data"][-1][1] += data +\
                self.stream.charsUntil(("\"", "&", "\u0000"))
        return True
1011 | |||
1012 | def attributeValueSingleQuotedState(self): | ||
1013 | data = self.stream.char() | ||
1014 | if data == "'": | ||
1015 | self.state = self.afterAttributeValueState | ||
1016 | elif data == "&": | ||
1017 | self.processEntityInAttribute("'") | ||
1018 | elif data == "\u0000": | ||
1019 | self.tokenQueue.append({"type": tokenTypes["ParseError"], | ||
1020 | "data": "invalid-codepoint"}) | ||
1021 | self.currentToken["data"][-1][1] += "\uFFFD" | ||
1022 | elif data is EOF: | ||
1023 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1024 | "eof-in-attribute-value-single-quote"}) | ||
1025 | self.state = self.dataState | ||
1026 | else: | ||
1027 | self.currentToken["data"][-1][1] += data +\ | ||
1028 | self.stream.charsUntil(("'", "&", "\u0000")) | ||
1029 | return True | ||
1030 | |||
    def attributeValueUnQuotedState(self):
        """Tokenize an unquoted attribute value (terminated by whitespace
        or ">")."""
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == "&":
            # Entity resolution; ">" is the extra terminating character.
            self.processEntityInAttribute(">")
        elif data == ">":
            self.emitCurrentToken()
        elif data in ('"', "'", "=", "<", "`"):
            # Parse error, but the character is still part of the value.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-character-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-attribute-value-no-quotes"})
            self.state = self.dataState
        else:
            # Ordinary text: bulk-consume up to any delimiter or error char.
            self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
                frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
        return True
1055 | |||
1056 | def afterAttributeValueState(self): | ||
1057 | data = self.stream.char() | ||
1058 | if data in spaceCharacters: | ||
1059 | self.state = self.beforeAttributeNameState | ||
1060 | elif data == ">": | ||
1061 | self.emitCurrentToken() | ||
1062 | elif data == "/": | ||
1063 | self.state = self.selfClosingStartTagState | ||
1064 | elif data is EOF: | ||
1065 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1066 | "unexpected-EOF-after-attribute-value"}) | ||
1067 | self.stream.unget(data) | ||
1068 | self.state = self.dataState | ||
1069 | else: | ||
1070 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1071 | "unexpected-character-after-attribute-value"}) | ||
1072 | self.stream.unget(data) | ||
1073 | self.state = self.beforeAttributeNameState | ||
1074 | return True | ||
1075 | |||
1076 | def selfClosingStartTagState(self): | ||
1077 | data = self.stream.char() | ||
1078 | if data == ">": | ||
1079 | self.currentToken["selfClosing"] = True | ||
1080 | self.emitCurrentToken() | ||
1081 | elif data is EOF: | ||
1082 | self.tokenQueue.append({"type": tokenTypes["ParseError"], | ||
1083 | "data": | ||
1084 | "unexpected-EOF-after-solidus-in-tag"}) | ||
1085 | self.stream.unget(data) | ||
1086 | self.state = self.dataState | ||
1087 | else: | ||
1088 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1089 | "unexpected-character-after-solidus-in-tag"}) | ||
1090 | self.stream.unget(data) | ||
1091 | self.state = self.beforeAttributeNameState | ||
1092 | return True | ||
1093 | |||
1094 | def bogusCommentState(self): | ||
1095 | # Make a new comment token and give it as value all the characters | ||
1096 | # until the first > or EOF (charsUntil checks for EOF automatically) | ||
1097 | # and emit it. | ||
1098 | data = self.stream.charsUntil(">") | ||
1099 | data = data.replace("\u0000", "\uFFFD") | ||
1100 | self.tokenQueue.append( | ||
1101 | {"type": tokenTypes["Comment"], "data": data}) | ||
1102 | |||
1103 | # Eat the character directly after the bogus comment which is either a | ||
1104 | # ">" or an EOF. | ||
1105 | self.stream.char() | ||
1106 | self.state = self.dataState | ||
1107 | return True | ||
1108 | |||
    def markupDeclarationOpenState(self):
        """Dispatch "<!" constructs: "--" starts a comment, "DOCTYPE" a
        doctype, "[CDATA[" a CDATA section (foreign content only); anything
        else falls back to a bogus comment.

        Characters are read into ``charStack`` so a failed match can be
        ungetted in full before entering the bogus-comment state.
        """
        charStack = [self.stream.char()]
        if charStack[-1] == "-":
            charStack.append(self.stream.char())
            if charStack[-1] == "-":
                self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
                self.state = self.commentStartState
                return True
        elif charStack[-1] in ('d', 'D'):
            # Case-insensitive match of the remaining "OCTYPE" letters.
            matched = True
            for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
                             ('y', 'Y'), ('p', 'P'), ('e', 'E')):
                charStack.append(self.stream.char())
                if charStack[-1] not in expected:
                    matched = False
                    break
            if matched:
                self.currentToken = {"type": tokenTypes["Doctype"],
                                     "name": "",
                                     "publicId": None, "systemId": None,
                                     "correct": True}
                self.state = self.doctypeState
                return True
        elif (charStack[-1] == "[" and
              self.parser is not None and
              self.parser.tree.openElements and
              self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
            # CDATA is only recognized in foreign (SVG/MathML) content,
            # and the match is case-sensitive.
            matched = True
            for expected in ["C", "D", "A", "T", "A", "["]:
                charStack.append(self.stream.char())
                if charStack[-1] != expected:
                    matched = False
                    break
            if matched:
                self.state = self.cdataSectionState
                return True

        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-dashes-or-doctype"})

        # Push everything back (in reverse) so bogusCommentState re-reads it.
        while charStack:
            self.stream.unget(charStack.pop())
        self.state = self.bogusCommentState
        return True
1153 | |||
    def commentStartState(self):
        """Handle the first character after the "<!--" that opened a comment."""
        data = self.stream.char()
        if data == "-":
            self.state = self.commentStartDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "\uFFFD"
        elif data == ">":
            # "<!-->" : an abruptly closed (empty) comment.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += data
            self.state = self.commentState
        return True
1176 | |||
    def commentStartDashState(self):
        """Handle the character after a single "-" at the very start of a
        comment's contents."""
        data = self.stream.char()
        if data == "-":
            # "<!----" : straight to the comment-end state.
            self.state = self.commentEndState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            # The pending "-" becomes literal comment data.
            self.currentToken["data"] += "-\uFFFD"
        elif data == ">":
            # "<!--->" : an abruptly closed comment.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "-" + data
            self.state = self.commentState
        return True
1199 | |||
    def commentState(self):
        """Accumulate comment contents until a "-" (possible end) or EOF."""
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # Bulk-consume ordinary comment text up to the next "-" or NUL.
            self.currentToken["data"] += data + \
                self.stream.charsUntil(("-", "\u0000"))
        return True
1217 | |||
    def commentEndDashState(self):
        """Handle the character after one "-" inside a comment."""
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            # The pending "-" becomes literal comment data.
            self.currentToken["data"] += "-\uFFFD"
            self.state = self.commentState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-end-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "-" + data
            self.state = self.commentState
        return True
1236 | |||
    def commentEndState(self):
        """Handle the character after "--" inside a comment; ">" closes it."""
        data = self.stream.char()
        if data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            # The pending "--" becomes literal comment data.
            self.currentToken["data"] += "--\uFFFD"
            self.state = self.commentState
        elif data == "!":
            # "--!" : handled by its own state ("--!>" still closes).
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-bang-after-double-dash-in-comment"})
            self.state = self.commentEndBangState
        elif data == "-":
            # Extra dash: report it, keep it, and stay in this state.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-dash-after-double-dash-in-comment"})
            self.currentToken["data"] += data
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-double-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # XXX
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-comment"})
            self.currentToken["data"] += "--" + data
            self.state = self.commentState
        return True
1267 | |||
    def commentEndBangState(self):
        """Handle the character after "--!" inside a comment."""
        data = self.stream.char()
        if data == ">":
            # "--!>" closes the comment despite the stray bang.
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "-":
            # The "--!" becomes literal data and the new "-" restarts the
            # end-dash sequence.
            self.currentToken["data"] += "--!"
            self.state = self.commentEndDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "--!\uFFFD"
            self.state = self.commentState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-end-bang-state"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "--!" + data
            self.state = self.commentState
        return True
1290 | |||
1291 | def doctypeState(self): | ||
1292 | data = self.stream.char() | ||
1293 | if data in spaceCharacters: | ||
1294 | self.state = self.beforeDoctypeNameState | ||
1295 | elif data is EOF: | ||
1296 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1297 | "expected-doctype-name-but-got-eof"}) | ||
1298 | self.currentToken["correct"] = False | ||
1299 | self.tokenQueue.append(self.currentToken) | ||
1300 | self.state = self.dataState | ||
1301 | else: | ||
1302 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1303 | "need-space-after-doctype"}) | ||
1304 | self.stream.unget(data) | ||
1305 | self.state = self.beforeDoctypeNameState | ||
1306 | return True | ||
1307 | |||
    def beforeDoctypeNameState(self):
        """Skip whitespace before the doctype name and start it on the
        first real character."""
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            # "<!DOCTYPE>" : missing name entirely.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-right-bracket"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] = "\uFFFD"
            self.state = self.doctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-eof"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] = data
            self.state = self.doctypeNameState
        return True
1333 | |||
    def doctypeNameState(self):
        """Accumulate the doctype name; it is lowercased whenever this
        state is left."""
        data = self.stream.char()
        if data in spaceCharacters:
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.state = self.afterDoctypeNameState
        elif data == ">":
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] += "\uFFFD"
            self.state = self.doctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype-name"})
            self.currentToken["correct"] = False
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] += data
        return True
1358 | |||
    def afterDoctypeNameState(self):
        """After the doctype name: look for a case-insensitive PUBLIC or
        SYSTEM keyword, or end/abort the doctype."""
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.currentToken["correct"] = False
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            if data in ("p", "P"):
                # Try to match the rest of "PUBLIC", case-insensitively.
                matched = True
                for expected in (("u", "U"), ("b", "B"), ("l", "L"),
                                 ("i", "I"), ("c", "C")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypePublicKeywordState
                    return True
            elif data in ("s", "S"):
                # Try to match the rest of "SYSTEM", case-insensitively.
                matched = True
                for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
                                 ("e", "E"), ("m", "M")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypeSystemKeywordState
                    return True

            # All the characters read before the current 'data' will be
            # [a-zA-Z], so they're garbage in the bogus doctype and can be
            # discarded; only the latest character might be '>' or EOF
            # and needs to be ungetted
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-space-or-right-bracket-in-doctype", "datavars":
                                    {"data": data}})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState

        return True
1409 | |||
1410 | def afterDoctypePublicKeywordState(self): | ||
1411 | data = self.stream.char() | ||
1412 | if data in spaceCharacters: | ||
1413 | self.state = self.beforeDoctypePublicIdentifierState | ||
1414 | elif data in ("'", '"'): | ||
1415 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1416 | "unexpected-char-in-doctype"}) | ||
1417 | self.stream.unget(data) | ||
1418 | self.state = self.beforeDoctypePublicIdentifierState | ||
1419 | elif data is EOF: | ||
1420 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1421 | "eof-in-doctype"}) | ||
1422 | self.currentToken["correct"] = False | ||
1423 | self.tokenQueue.append(self.currentToken) | ||
1424 | self.state = self.dataState | ||
1425 | else: | ||
1426 | self.stream.unget(data) | ||
1427 | self.state = self.beforeDoctypePublicIdentifierState | ||
1428 | return True | ||
1429 | |||
    def beforeDoctypePublicIdentifierState(self):
        """Expect the opening quote of the doctype public identifier."""
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            # Initialize publicId so the quoted states can append to it.
            self.currentToken["publicId"] = ""
            self.state = self.doctypePublicIdentifierDoubleQuotedState
        elif data == "'":
            self.currentToken["publicId"] = ""
            self.state = self.doctypePublicIdentifierSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # Unquoted garbage: the rest of the doctype is bogus.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True
1458 | |||
    def doctypePublicIdentifierDoubleQuotedState(self):
        """Accumulate a double-quoted doctype public identifier."""
        data = self.stream.char()
        if data == "\"":
            self.state = self.afterDoctypePublicIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["publicId"] += "\uFFFD"
        elif data == ">":
            # Premature ">" inside the identifier ends the doctype early.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["publicId"] += data
        return True
1482 | |||
1483 | def doctypePublicIdentifierSingleQuotedState(self): | ||
1484 | data = self.stream.char() | ||
1485 | if data == "'": | ||
1486 | self.state = self.afterDoctypePublicIdentifierState | ||
1487 | elif data == "\u0000": | ||
1488 | self.tokenQueue.append({"type": tokenTypes["ParseError"], | ||
1489 | "data": "invalid-codepoint"}) | ||
1490 | self.currentToken["publicId"] += "\uFFFD" | ||
1491 | elif data == ">": | ||
1492 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1493 | "unexpected-end-of-doctype"}) | ||
1494 | self.currentToken["correct"] = False | ||
1495 | self.tokenQueue.append(self.currentToken) | ||
1496 | self.state = self.dataState | ||
1497 | elif data is EOF: | ||
1498 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1499 | "eof-in-doctype"}) | ||
1500 | self.currentToken["correct"] = False | ||
1501 | self.tokenQueue.append(self.currentToken) | ||
1502 | self.state = self.dataState | ||
1503 | else: | ||
1504 | self.currentToken["publicId"] += data | ||
1505 | return True | ||
1506 | |||
1507 | def afterDoctypePublicIdentifierState(self): | ||
1508 | data = self.stream.char() | ||
1509 | if data in spaceCharacters: | ||
1510 | self.state = self.betweenDoctypePublicAndSystemIdentifiersState | ||
1511 | elif data == ">": | ||
1512 | self.tokenQueue.append(self.currentToken) | ||
1513 | self.state = self.dataState | ||
1514 | elif data == '"': | ||
1515 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1516 | "unexpected-char-in-doctype"}) | ||
1517 | self.currentToken["systemId"] = "" | ||
1518 | self.state = self.doctypeSystemIdentifierDoubleQuotedState | ||
1519 | elif data == "'": | ||
1520 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1521 | "unexpected-char-in-doctype"}) | ||
1522 | self.currentToken["systemId"] = "" | ||
1523 | self.state = self.doctypeSystemIdentifierSingleQuotedState | ||
1524 | elif data is EOF: | ||
1525 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1526 | "eof-in-doctype"}) | ||
1527 | self.currentToken["correct"] = False | ||
1528 | self.tokenQueue.append(self.currentToken) | ||
1529 | self.state = self.dataState | ||
1530 | else: | ||
1531 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1532 | "unexpected-char-in-doctype"}) | ||
1533 | self.currentToken["correct"] = False | ||
1534 | self.state = self.bogusDoctypeState | ||
1535 | return True | ||
1536 | |||
1537 | def betweenDoctypePublicAndSystemIdentifiersState(self): | ||
1538 | data = self.stream.char() | ||
1539 | if data in spaceCharacters: | ||
1540 | pass | ||
1541 | elif data == ">": | ||
1542 | self.tokenQueue.append(self.currentToken) | ||
1543 | self.state = self.dataState | ||
1544 | elif data == '"': | ||
1545 | self.currentToken["systemId"] = "" | ||
1546 | self.state = self.doctypeSystemIdentifierDoubleQuotedState | ||
1547 | elif data == "'": | ||
1548 | self.currentToken["systemId"] = "" | ||
1549 | self.state = self.doctypeSystemIdentifierSingleQuotedState | ||
1550 | elif data == EOF: | ||
1551 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1552 | "eof-in-doctype"}) | ||
1553 | self.currentToken["correct"] = False | ||
1554 | self.tokenQueue.append(self.currentToken) | ||
1555 | self.state = self.dataState | ||
1556 | else: | ||
1557 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1558 | "unexpected-char-in-doctype"}) | ||
1559 | self.currentToken["correct"] = False | ||
1560 | self.state = self.bogusDoctypeState | ||
1561 | return True | ||
1562 | |||
1563 | def afterDoctypeSystemKeywordState(self): | ||
1564 | data = self.stream.char() | ||
1565 | if data in spaceCharacters: | ||
1566 | self.state = self.beforeDoctypeSystemIdentifierState | ||
1567 | elif data in ("'", '"'): | ||
1568 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1569 | "unexpected-char-in-doctype"}) | ||
1570 | self.stream.unget(data) | ||
1571 | self.state = self.beforeDoctypeSystemIdentifierState | ||
1572 | elif data is EOF: | ||
1573 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1574 | "eof-in-doctype"}) | ||
1575 | self.currentToken["correct"] = False | ||
1576 | self.tokenQueue.append(self.currentToken) | ||
1577 | self.state = self.dataState | ||
1578 | else: | ||
1579 | self.stream.unget(data) | ||
1580 | self.state = self.beforeDoctypeSystemIdentifierState | ||
1581 | return True | ||
1582 | |||
1583 | def beforeDoctypeSystemIdentifierState(self): | ||
1584 | data = self.stream.char() | ||
1585 | if data in spaceCharacters: | ||
1586 | pass | ||
1587 | elif data == "\"": | ||
1588 | self.currentToken["systemId"] = "" | ||
1589 | self.state = self.doctypeSystemIdentifierDoubleQuotedState | ||
1590 | elif data == "'": | ||
1591 | self.currentToken["systemId"] = "" | ||
1592 | self.state = self.doctypeSystemIdentifierSingleQuotedState | ||
1593 | elif data == ">": | ||
1594 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1595 | "unexpected-char-in-doctype"}) | ||
1596 | self.currentToken["correct"] = False | ||
1597 | self.tokenQueue.append(self.currentToken) | ||
1598 | self.state = self.dataState | ||
1599 | elif data is EOF: | ||
1600 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1601 | "eof-in-doctype"}) | ||
1602 | self.currentToken["correct"] = False | ||
1603 | self.tokenQueue.append(self.currentToken) | ||
1604 | self.state = self.dataState | ||
1605 | else: | ||
1606 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1607 | "unexpected-char-in-doctype"}) | ||
1608 | self.currentToken["correct"] = False | ||
1609 | self.state = self.bogusDoctypeState | ||
1610 | return True | ||
1611 | |||
1612 | def doctypeSystemIdentifierDoubleQuotedState(self): | ||
1613 | data = self.stream.char() | ||
1614 | if data == "\"": | ||
1615 | self.state = self.afterDoctypeSystemIdentifierState | ||
1616 | elif data == "\u0000": | ||
1617 | self.tokenQueue.append({"type": tokenTypes["ParseError"], | ||
1618 | "data": "invalid-codepoint"}) | ||
1619 | self.currentToken["systemId"] += "\uFFFD" | ||
1620 | elif data == ">": | ||
1621 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1622 | "unexpected-end-of-doctype"}) | ||
1623 | self.currentToken["correct"] = False | ||
1624 | self.tokenQueue.append(self.currentToken) | ||
1625 | self.state = self.dataState | ||
1626 | elif data is EOF: | ||
1627 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1628 | "eof-in-doctype"}) | ||
1629 | self.currentToken["correct"] = False | ||
1630 | self.tokenQueue.append(self.currentToken) | ||
1631 | self.state = self.dataState | ||
1632 | else: | ||
1633 | self.currentToken["systemId"] += data | ||
1634 | return True | ||
1635 | |||
1636 | def doctypeSystemIdentifierSingleQuotedState(self): | ||
1637 | data = self.stream.char() | ||
1638 | if data == "'": | ||
1639 | self.state = self.afterDoctypeSystemIdentifierState | ||
1640 | elif data == "\u0000": | ||
1641 | self.tokenQueue.append({"type": tokenTypes["ParseError"], | ||
1642 | "data": "invalid-codepoint"}) | ||
1643 | self.currentToken["systemId"] += "\uFFFD" | ||
1644 | elif data == ">": | ||
1645 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1646 | "unexpected-end-of-doctype"}) | ||
1647 | self.currentToken["correct"] = False | ||
1648 | self.tokenQueue.append(self.currentToken) | ||
1649 | self.state = self.dataState | ||
1650 | elif data is EOF: | ||
1651 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1652 | "eof-in-doctype"}) | ||
1653 | self.currentToken["correct"] = False | ||
1654 | self.tokenQueue.append(self.currentToken) | ||
1655 | self.state = self.dataState | ||
1656 | else: | ||
1657 | self.currentToken["systemId"] += data | ||
1658 | return True | ||
1659 | |||
1660 | def afterDoctypeSystemIdentifierState(self): | ||
1661 | data = self.stream.char() | ||
1662 | if data in spaceCharacters: | ||
1663 | pass | ||
1664 | elif data == ">": | ||
1665 | self.tokenQueue.append(self.currentToken) | ||
1666 | self.state = self.dataState | ||
1667 | elif data is EOF: | ||
1668 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1669 | "eof-in-doctype"}) | ||
1670 | self.currentToken["correct"] = False | ||
1671 | self.tokenQueue.append(self.currentToken) | ||
1672 | self.state = self.dataState | ||
1673 | else: | ||
1674 | self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
1675 | "unexpected-char-in-doctype"}) | ||
1676 | self.state = self.bogusDoctypeState | ||
1677 | return True | ||
1678 | |||
1679 | def bogusDoctypeState(self): | ||
1680 | data = self.stream.char() | ||
1681 | if data == ">": | ||
1682 | self.tokenQueue.append(self.currentToken) | ||
1683 | self.state = self.dataState | ||
1684 | elif data is EOF: | ||
1685 | # XXX EMIT | ||
1686 | self.stream.unget(data) | ||
1687 | self.tokenQueue.append(self.currentToken) | ||
1688 | self.state = self.dataState | ||
1689 | else: | ||
1690 | pass | ||
1691 | return True | ||
1692 | |||
1693 | def cdataSectionState(self): | ||
1694 | data = [] | ||
1695 | while True: | ||
1696 | data.append(self.stream.charsUntil("]")) | ||
1697 | data.append(self.stream.charsUntil(">")) | ||
1698 | char = self.stream.char() | ||
1699 | if char == EOF: | ||
1700 | break | ||
1701 | else: | ||
1702 | assert char == ">" | ||
1703 | if data[-1][-2:] == "]]": | ||
1704 | data[-1] = data[-1][:-2] | ||
1705 | break | ||
1706 | else: | ||
1707 | data.append(char) | ||
1708 | |||
1709 | data = "".join(data) # pylint:disable=redefined-variable-type | ||
1710 | # Deal with null here rather than in the parser | ||
1711 | nullCount = data.count("\u0000") | ||
1712 | if nullCount > 0: | ||
1713 | for _ in range(nullCount): | ||
1714 | self.tokenQueue.append({"type": tokenTypes["ParseError"], | ||
1715 | "data": "invalid-codepoint"}) | ||
1716 | data = data.replace("\u0000", "\uFFFD") | ||
1717 | if data: | ||
1718 | self.tokenQueue.append({"type": tokenTypes["Characters"], | ||
1719 | "data": data}) | ||
1720 | self.state = self.dataState | ||
1721 | return True | ||